Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -528,6 +528,18 @@
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace) const;
+  /// \return The cost of the strided memory operation.
+  /// \p Opcode is the memory operation code
+  /// \p VecTy is the vector type of the interleaved access.
+  /// \p Factor is the interleave factor
+  /// \p Indices is the indices for interleaved load members (as interleaved
+  /// load allows gaps)
+  /// \p Alignment is the alignment of the memory operation
+  /// \p AddressSpace is address space of the pointer.
+  int getStridedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+                             ArrayRef<unsigned> Indices, unsigned Alignment,
+                             unsigned AddressSpace) const;
+
   /// \brief Calculate the cost of performing a vector reduction.
   ///
   /// This is the cost of reducing the vector value of type \p Ty to a scalar
@@ -692,6 +704,11 @@
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
                                            unsigned AddressSpace) = 0;
+  virtual int getStridedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                     unsigned Factor,
+                                     ArrayRef<unsigned> Indices,
+                                     unsigned Alignment,
+                                     unsigned AddressSpace) = 0;
   virtual int getReductionCost(unsigned Opcode, Type *Ty,
                                bool IsPairwiseForm) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
@@ -900,6 +917,12 @@
     return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
   }
+  int getStridedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+                             ArrayRef<unsigned> Indices, unsigned Alignment,
+                             unsigned AddressSpace) override {
+    return Impl.getStridedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                       Alignment, AddressSpace);
+  }
   int getReductionCost(unsigned Opcode, Type *Ty,
                        bool IsPairwiseForm) override {
     return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -332,6 +332,15 @@
     return 1;
   }
+  unsigned getStridedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                  unsigned Factor,
+                                  ArrayRef<unsigned> Indices,
+                                  unsigned Alignment,
+                                  unsigned AddressSpace) {
+    return getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                      Alignment, AddressSpace);
+  }
+
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                  ArrayRef<Type *> Tys, FastMathFlags FMF) {
     return 1;
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -595,6 +595,15 @@
     return Cost;
   }
+  unsigned getStridedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                  unsigned Factor,
+                                  ArrayRef<unsigned> Indices,
+                                  unsigned Alignment,
+                                  unsigned AddressSpace) {
+    return getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                      Alignment, AddressSpace);
+  }
+
   /// Get intrinsic cost based on arguments
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                  ArrayRef<Value *> Args, FastMathFlags FMF) {
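As a stand-alone illustration (class names and cost numbers below are made up, not taken from this patch), the three header changes above all follow the same delegation shape: a concrete TTI implementation may override getStridedMemoryOpCost, and the default implementations simply forward to getInterleavedMemoryOpCost, so only targets that override the hook price strided accesses differently.

#include <cstdio>

// Base cost model: by default a strided access is priced exactly like an
// interleaved access, mirroring the forwarding in TargetTransformInfoImpl.h
// and BasicTTIImpl.h above.
struct DefaultCostModel {
  virtual ~DefaultCostModel() = default;
  virtual int getInterleavedCost(int Factor) { return 4 * Factor; }
  virtual int getStridedCost(int Factor) { return getInterleavedCost(Factor); }
};

// A target that can lower strided accesses more cheaply overrides only the
// strided hook, as the X86 implementation does further down in this patch.
struct TargetCostModel : DefaultCostModel {
  int getStridedCost(int Factor) override { return 2 * Factor; }
};

int main() {
  TargetCostModel TTI;
  std::printf("interleaved=%d strided=%d\n", TTI.getInterleavedCost(3),
              TTI.getStridedCost(3));
  return 0;
}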
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -331,6 +331,15 @@
   return Cost;
 }
+int TargetTransformInfo::getStridedMemoryOpCost(
+    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+    unsigned Alignment, unsigned AddressSpace) const {
+  int Cost = TTIImpl->getStridedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                                ArrayRef<Type *> Tys,
                                                FastMathFlags FMF) const {
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -682,6 +682,9 @@
     unsigned getByValTypeAlignment(Type *Ty,
                                    const DataLayout &DL) const override;
+    /// Returns the maximum supported interleave factor.
+    unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
     /// Returns the target specific optimal type for load
     /// and store operations as a result of memset, memcpy, and memmove
     /// lowering. If DstAlign is zero that means it's safe to destination
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -76,6 +76,9 @@
                                  unsigned AddressSpace);
   int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                             unsigned AddressSpace);
+  int getStridedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+                             ArrayRef<unsigned> Indices, unsigned Alignment,
+                             unsigned AddressSpace);
   int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
                              bool VariableMask, unsigned Alignment);
   int getAddressComputationCost(Type *PtrTy, bool IsComplex);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -26,6 +26,51 @@
 #define DEBUG_TYPE "x86tti"
 
+/// Returns the number of valid memory elements in one VF-wide access.
+/// I.e. for Stride 2 and VF 4:
+///     |---|---|---|---|
+///     | a | * | b | * |
+///     |---|---|---|---|
+///       0   1   2   3
+/// Only offsets 0 & 2, holding the values a & b, are memory accesses of
+/// interest, so it will return 2.
+/// NOTE: It assumes all iterations of a given stride hold a common memory
+/// access pattern.
+static unsigned getValidMemInSingleVFAccess(unsigned VF, unsigned Stride) {
+  return ((VF - 1) / Stride) + 1;
+}
+
+/// Returns the number of loads/stores required to fill a vector register.
+/// I.e. for Stride 3 and VF 4:
+///     |---|---|---|---|---|---|---|---|
+///     | a | * | * | b | c | * | * | d |
+///     |---|---|---|---|---|---|---|---|
+///       0   1   2   3   6   7   8   9
+/// Offsets 0, 3, 6, 9 are required to fill the vector register,
+/// so 2 vector loads will be required.
+/// NOTE: It assumes all iterations of a given stride hold a common memory
+/// access pattern.
+static unsigned getRequiredLoadStore(unsigned VF, unsigned Stride) {
+  unsigned VMem = getValidMemInSingleVFAccess(VF, Stride);
+  return (VF / VMem) + ((VF % VMem) ? 1 : 0);
+}
+
+/// Returns the number of shuffles required for promotion.
+/// Promoting a shuffle to a wider type may require generating additional
+/// shuffles.
+static unsigned getShuffleRequiredForPromotion(unsigned VF, unsigned Stride) {
+  unsigned ValidElements = getValidMemInSingleVFAccess(VF, Stride);
+  unsigned FixUpStride = 0;
+  // Identify the valid elements in the first two loads.
+  unsigned Prev = 2 * ValidElements;
+  // Iterate while the number of collected elements is less than VF.
+  while (Prev < VF) {
+    FixUpStride++;
+    Prev = Prev + ValidElements;
+  }
+  return FixUpStride;
+}
+
 //===----------------------------------------------------------------------===//
 //
 // X86 cost model.
@@ -1100,6 +1145,53 @@
   return Cost+LT.first;
 }
 
+/// Returns the vectorization cost of a strided memory load/store.
+/// In addition to the load/store instructions it also considers the cost of
+/// the required shuffles.
+int X86TTIImpl::getStridedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                       unsigned Factor,
+                                       ArrayRef<unsigned> Indices,
+                                       unsigned Alignment,
+                                       unsigned AddressSpace) {
+  assert(Factor >= 2 && "Invalid interleave factor");
+  assert(isa<VectorType>(VecTy) && "Expect a vector type");
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+    // The input is the wide vector type (i.e. VectorTy * Stride).
+    // First get the actual vector type.
+    Type *SubVecTy = VectorType::get(VecTy->getScalarType(),
+                                     (VecTy->getVectorNumElements() / Factor));
+    // Identify the vector factor and the number of loads/stores required.
+    unsigned VF = SubVecTy->getVectorNumElements();
+    unsigned MemOpRequired = getRequiredLoadStore(VF, Factor);
+    // First, the cost of a single load/store operation.
+    unsigned Cost = getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace);
+    // Multiply Cost by the number of load/store operations.
+    Cost = Cost * MemOpRequired;
+    unsigned ShuffleCost = 0;
+    // Identify the number of shuffles required for promotion.
+    // When the types differ we can't directly shuffle the elements, because
+    // shufflevector expects both operands to have the same type,
+    // i.e. <4 x i32> * <2 x i32> is not possible, so it is required
+    // to promote <2 x i32> to <4 x i32> with an undef vector.
+    unsigned TotalShufflePerMemOp = getShuffleRequiredForPromotion(VF, Factor);
+    // Add the cost of the shuffles required for promotion.
+    // NOTE: This is only applicable for loads.
+    if (Opcode == Instruction::Load)
+      for (unsigned i = 0; i < MemOpRequired; i++)
+        for (unsigned j = 0; j < TotalShufflePerMemOp; j++)
+          ShuffleCost +=
+              getShuffleCost(TTI::SK_InsertSubvector, VecTy, 0, SubVecTy);
+    // Add the cost of the shuffles required to combine the multiple memory
+    // operations.
+    for (unsigned i = 0; i < (MemOpRequired - 1); i++)
+      ShuffleCost +=
+          getShuffleCost(TTI::SK_InsertSubvector, SubVecTy, 0, SubVecTy);
+    // Update the final cost & return.
+    Cost += ShuffleCost;
+    return Cost;
+  }
+  // Otherwise fall back to the base getInterleavedMemoryOpCost.
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
+
 int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
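The helper arithmetic above is easiest to see with concrete numbers. The following stand-alone sketch is not part of the patch; it merely mirrors getValidMemInSingleVFAccess, getRequiredLoadStore and getShuffleRequiredForPromotion, and the VF/stride values are examples only.

#include <cstdio>

// Mirrors getValidMemInSingleVFAccess(): elements of interest in one access.
static unsigned validMemInSingleVFAccess(unsigned VF, unsigned Stride) {
  return ((VF - 1) / Stride) + 1;
}

// Mirrors getRequiredLoadStore(): vector loads/stores needed to gather VF
// elements of interest.
static unsigned requiredLoadStore(unsigned VF, unsigned Stride) {
  unsigned VMem = validMemInSingleVFAccess(VF, Stride);
  return (VF / VMem) + ((VF % VMem) ? 1 : 0);
}

// Mirrors getShuffleRequiredForPromotion().
static unsigned shuffleRequiredForPromotion(unsigned VF, unsigned Stride) {
  unsigned ValidElements = validMemInSingleVFAccess(VF, Stride);
  unsigned FixUpStride = 0;
  for (unsigned Prev = 2 * ValidElements; Prev < VF; Prev += ValidElements)
    ++FixUpStride;
  return FixUpStride;
}

int main() {
  // VF = 4, Stride = 3: 2 useful elements per access, 2 loads to fill a
  // register, and no promotion shuffles (2 * 2 >= 4).  Prints "2 2 0".
  std::printf("%u %u %u\n", validMemInSingleVFAccess(4, 3),
              requiredLoadStore(4, 3), shuffleRequiredForPromotion(4, 3));
  // VF = 8, Stride = 3: 3 useful elements per access, 3 loads, and one extra
  // promotion shuffle (3, then 6, then 9 collected elements).  Prints "3 3 1".
  std::printf("%u %u %u\n", validMemInSingleVFAccess(8, 3),
              requiredLoadStore(8, 3), shuffleRequiredForPromotion(8, 3));
  return 0;
}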
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -149,6 +149,17 @@
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
 
+/// Option to enable strided vectorization.
+static cl::opt<bool> EnableStridedVectorization(
+    "enable-strided-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable strided memory accesses vectorization in a loop"));
+
+/// Option to enable masked load generation; it only takes effect together
+/// with strided vectorization.
+static cl::opt<bool> EnableStridedMaskLoad(
+    "enable-strided-mask-load", cl::init(false), cl::Hidden,
+    cl::desc("Enable strided memory vectorization with masked load generation"));
+
 /// Maximum factor for an interleaved memory access.
 static cl::opt<unsigned> MaxInterleaveGroupFactor(
     "max-interleave-group-factor", cl::Hidden,
@@ -293,6 +304,47 @@
   return nullptr;
 }
 
+/// Returns the SkipFactor.
+/// The SkipFactor is the additional number of offsets by which to move the
+/// index pointer after a vector load/store. The index pointer is the start
+/// address of memory in a particular loop iteration; from this address we do
+/// a vector load/store. The index pointer movement is required for subsequent
+/// memory accesses in the loop. By introducing the SkipFactor we ensure that
+/// all vector memory accesses hold the same memory pattern, which is why a
+/// common mask can be used for accesses across iterations.
+static unsigned getSkipFactor(unsigned VF, unsigned Stride) {
+  return (Stride - (VF % Stride)) % Stride;
+}
+
+/// Returns the number of valid memory elements in one VF-wide access.
+/// I.e. for Stride 2 and VF 4:
+///     |---|---|---|---|
+///     | a | * | b | * |
+///     |---|---|---|---|
+///       0   1   2   3
+/// Only offsets 0 & 2, holding the values a & b, are memory accesses of
+/// interest, so it will return 2.
+/// NOTE: It assumes all iterations of a given stride hold a common memory
+/// access pattern.
+static unsigned getValidMemInSingleVFAccess(unsigned VF, unsigned Stride) {
+  return ((VF - 1) / Stride) + 1;
+}
+
+/// Returns the number of loads/stores required to fill a vector register.
+/// I.e. for Stride 3 and VF 4:
+///     |---|---|---|---|---|---|---|---|
+///     | a | * | * | b | c | * | * | d |
+///     |---|---|---|---|---|---|---|---|
+///       0   1   2   3   6   7   8   9
+/// Offsets 0, 3, 6, 9 are required to fill the vector register,
+/// so 2 vector loads will be required.
+/// NOTE: It assumes all iterations of a given stride hold a common memory
+/// access pattern.
+static unsigned getRequiredLoadStore(unsigned VF, unsigned Stride) {
+  unsigned VMem = getValidMemInSingleVFAccess(VF, Stride);
+  return (VF / VMem) + ((VF % VMem) ? 1 : 0);
+}
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -440,6 +492,15 @@
   /// Try to vectorize the interleaved access group that \p Instr belongs to.
   void vectorizeInterleaveGroup(Instruction *Instr);
 
+  /// Try to vectorize the strided memory access.
+  void vectorizeStridedMemoryAccess(Instruction *Instr);
+
+  /// Vectorize the strided load.
+  void vectorizeStridedLoad(Instruction *Instr);
+
+  /// Vectorize the strided store.
+  void vectorizeStridedStore(Instruction *Instr);
+
   /// Generate a shuffle sequence that will reverse the vector Vec.
   virtual Value *reverseVector(Value *Vec);
 
@@ -459,7 +520,17 @@
   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
   /// Emit bypass checks to check any memory assumptions we may have made.
   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
-
+  /// Returns the mask required for a masked load/store instruction.
+  Constant *getLoadStoreInstMask(IRBuilder<> &Builder, unsigned VF,
+                                 unsigned Stride, unsigned NumVec);
+  /// Returns the mask required for the shuffle used by a strided store.
+  Constant *getStoreShuffleVectorMask(IRBuilder<> &Builder, unsigned VF,
+                                      unsigned Stride, unsigned Part);
+  /// Returns the mask required for the shuffle used by a strided load.
+  Constant *getLoadShuffleVectorMask(IRBuilder<> &Builder, unsigned Start,
+                                     unsigned Stride, unsigned VF,
+                                     bool isFirstOpShuffle, unsigned FirstOpLen,
+                                     unsigned SecondOpLen);
   /// Add additional metadata to \p To that was not present on \p Orig.
   ///
   /// Currently this is used to add the noalias annotations based on the
@@ -830,6 +901,142 @@
   Instruction *InsertPos;
 };
 
+/// \brief The descriptor for a strided memory access.
+struct StrideDescriptor {
+  StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, unsigned Align)
+      : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
+
+  StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}
+
+  int Stride;       // The access's stride. It is negative for a reverse access.
+  const SCEV *Scev; // The scalar expression of this access.
+  unsigned Size;    // The size of the memory object.
+  unsigned Align;   // The alignment of this access.
+};
+
+class StrideAccessInfo {
+public:
+  const Instruction *MemAccessInstr;
+  StrideDescriptor &SD;
+  bool hasValidStride;
+  ScalarEvolution *SE;
+  Loop *CurLoop;
+  LoopInfo *LInfo;
+  SmallVector<const Instruction *, 4> DependentInstr;
+
+  StrideAccessInfo(const Instruction *MemAccessInstr, StrideDescriptor &SD,
+                   ScalarEvolution *SE, Loop *CurLoop, LoopInfo *LInfo)
+      : MemAccessInstr(MemAccessInstr), SD(SD), hasValidStride(false), SE(SE),
+        CurLoop(CurLoop), LInfo(LInfo) {
+    analyzeStride(MemAccessInstr);
+  }
+
+  void insertDependentInstr(const Instruction *I) {
+    DependentInstr.push_back(I);
+  }
+
+  SmallVector<const Instruction *, 4> &getDependentInstr(const Instruction *I) {
+    return DependentInstr;
+  }
+
+  bool isValidStrideOperation() const;
+  bool validateStrideOperation(Instruction *I);
+  void analyzeStride(const Instruction *);
+
+  void print() const {
+    dbgs() << "Memory Instruction : ";
+    MemAccessInstr->dump();
+    dbgs() << "Has dependent Instr \n";
+    SmallVector<const Instruction *, 4>::const_iterator I, E;
+    for (I = DependentInstr.begin(), E = DependentInstr.end(); I != E; ++I) {
+      dbgs() << "  ";
+      (*I)->dump();
+      dbgs() << "\n";
+    }
+  }
+};
+
+/// Validates whether an instruction performs a valid strided operation.
+/// In the index computation it only allows Add, Sub, Shl & Mul operations;
+/// if an index has a non-constant stride it returns false.
+/// In case of a PHI it must be an induction PHI.
+bool StrideAccessInfo::validateStrideOperation(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Shl:
+  case Instruction::Mul: {
+    // The second operand should be a compile-time constant.
+    ConstantInt *CV = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (!CV)
+      return false;
+    // The first operand should be a valid instruction.
+    Instruction *Op0Inst = dyn_cast<Instruction>(I->getOperand(0));
+    if (!Op0Inst)
+      return false;
+    // Analyze the first operand further.
+    if (!validateStrideOperation(Op0Inst)) {
+      return false;
+    }
+    // Consider the instruction as dependent & return true.
+    insertDependentInstr(I);
+    return true;
+  }
+  case Instruction::PHI: {
+    // The PHI should be an induction PHI, else return false.
+    InductionDescriptor ID;
+    PHINode *Phi = dyn_cast<PHINode>(I);
+    if (InductionDescriptor::isInductionPHI(Phi, SE, ID)) {
+      return true;
+    }
+    return false;
+  }
+  default:
+    return false;
+  }
+  return false;
+}
+
+/// Analyze the stride operation in an instruction and mark it as a valid
+/// stride access; it considers the operations involved in the stride
+/// calculation and only handles simple stride patterns, i.e.
+///   arr[i*X], arr[(i+1) * X], arr[(i*X)+1]   (where X is a constant stride)
+void StrideAccessInfo::analyzeStride(const Instruction *I) {
+  const LoadInst *Ld = dyn_cast<LoadInst>(I);
+  const StoreInst *St = dyn_cast<StoreInst>(I);
+  assert((Ld || St) && "Invalid instruction, not load or store");
+  const Value *Ptr = Ld ? Ld->getPointerOperand() : St->getPointerOperand();
+
+  const GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!Gep)
+    return;
+  unsigned InductionOperand = getGEPInductionOperand(Gep);
+  const SCEV *IndGep = SE->getSCEV(Gep->getOperand(InductionOperand));
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IndGep);
+  if (!AR || !AR->isAffine())
+    return;
+  for (unsigned int i = 0; i < Gep->getNumOperands(); i++) {
+    Instruction *UI = dyn_cast<Instruction>(Gep->getOperand(i));
+    if (!UI)
+      continue;
+    // Consider only instructions inside the current loop.
+    if (LInfo->getLoopFor(UI->getParent()) != CurLoop)
+      continue;
+    // Return when an invalid stride operation is found.
+    if (!validateStrideOperation(UI))
+      return;
+  }
+  hasValidStride = true;
+  DEBUG(dbgs() << "LV: Identified valid stride operation for strided"
+               << " memory vectorization, from instruction:\n"
+               << *I << "\n");
+  return;
+}
+
+/// For a strided memory access, returns true if it holds a valid stride
+/// operation, else returns false.
+bool StrideAccessInfo::isValidStrideOperation() const { return hasValidStride; }
+
 /// \brief Drive the analysis of interleaved memory accesses in the loop.
 ///
 /// Use this class to analyze interleaved accesses only when we can vectorize
@@ -855,7 +1062,7 @@
   /// \brief Analyze the interleaved accesses and collect them in interleave
   /// groups. Substitute symbolic strides using \p Strides.
-  void analyzeInterleaving(const ValueToValueMap &Strides);
+  void analyzeInterleaving(const ValueToValueMap &Strides, LoopInfo *LInfo);
 
   /// \brief Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
@@ -879,6 +1086,19 @@
     return nullptr;
   }
 
+  /// For a strided memory access, returns true if it holds a valid stride
+  /// operation, else returns false.
+  bool isValidStrideOperation(Instruction *I) {
+    MapVector<Instruction *, StrideAccessInfo>::iterator It =
+        StrideInfo.find(I);
+    if (It != StrideInfo.end())
+      return It->second.isValidStrideOperation();
+    return false;
+  }
+
+  /// Holds each instruction & its stride access information.
+  MapVector<Instruction *, StrideAccessInfo> StrideInfo;
+
   /// \brief Returns true if an interleaved group that may access memory
   /// out-of-bounds requires a scalar epilogue iteration for correctness.
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
@@ -900,20 +1120,6 @@
   /// Holds the relationships between the members and the interleave group.
   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
 
-  /// \brief The descriptor for a strided memory access.
-  struct StrideDescriptor {
-    StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
-                     unsigned Align)
-        : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
-
-    StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}
-
-    int Stride; // The access's stride. It is negative for a reverse access.
-    const SCEV *Scev; // The scalar expression of this access
-    unsigned Size; // The size of the memory object.
-    unsigned Align; // The alignment of this access.
-  };
-
   /// \brief Create a new interleave group with the given instruction \p Instr,
   /// stride \p Stride and alignment \p Align.
   ///
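For reference, these are the kinds of source-level access patterns the StrideAccessInfo analysis above is intended to accept or reject. This snippet is illustrative only; the function and array names are not taken from the patch or its tests.

// Accepted: the GEP index is built only from Add/Sub/Shl/Mul with a
// compile-time-constant second operand, rooted at an induction PHI.
void acceptedPatterns(int *A, int *B, int *C, int N) {
  for (int i = 0; i < N; ++i) {
    C[i] = A[3 * i];         // arr[i * X] with a constant stride X
    B[i] = A[3 * i + 1];     // arr[(i * X) + 1]
    C[i] += A[(i + 1) * 3];  // arr[(i + 1) * X]
  }
}

// Rejected: the stride K is not a compile-time constant, so
// validateStrideOperation() returns false for the index computation.
void rejectedPattern(int *A, int *C, int N, int K) {
  for (int i = 0; i < N; ++i)
    C[i] = A[K * i];
}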
@@ -938,7 +1144,7 @@
   /// \brief Collect all the accesses with a constant stride in program order.
   void collectConstStridedAccesses(
       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
-      const ValueToValueMap &Strides);
+      const ValueToValueMap &Strides, LoopInfo *LInfo);
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1271,11 +1477,11 @@
                              const TargetTransformInfo *TTI,
                              LoopAccessAnalysis *LAA,
                              LoopVectorizationRequirements *R,
-                             LoopVectorizeHints *H)
+                             LoopVectorizeHints *H, LoopInfo *LInfo)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
         TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
         Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
-        Requirements(R), Hints(H) {}
+        Requirements(R), Hints(H), LInfo(LInfo) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -1403,6 +1609,15 @@
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
   unsigned getNumPredStores() const { return NumPredStores; }
 
+  /// \brief Get the collected strided access information.
+  MapVector<Instruction *, StrideAccessInfo> &getStrideAccessGroup() {
+    return InterleaveInfo.StrideInfo;
+  }
+  /// For a strided memory access, returns true if it holds a valid stride
+  /// operation, else returns false.
+  bool isValidStrideOperation(Instruction *I) {
+    return InterleaveInfo.isValidStrideOperation(I);
+  }
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -1515,6 +1730,7 @@
   /// While vectorizing these instructions we have to generate a
   /// call to the appropriate masked intrinsic
   SmallPtrSet<Instruction *, 8> MaskedOp;
+  LoopInfo *LInfo;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1888,7 +2104,7 @@
     // Check if it is legal to vectorize the loop.
     LoopVectorizationRequirements Requirements;
     LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
-                                  &Requirements, &Hints);
+                                  &Requirements, &Hints, LI);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
       emitMissedWarning(F, L, Hints);
@@ -2289,6 +2505,88 @@
                                      "reverse");
 }
 
+/// Based on the number of valid elements it generates the mask for a masked
+/// load or store.
+/// I.e. if the stride is 2 and VF is 4, the number of valid elements is 2;
+/// in this case it will generate
+/// Mask: <i1 1, i1 0, i1 1, i1 0>
+/// If the number of valid elements is 1 (in some cases) then it will generate
+/// Mask: <i1 1, i1 0, i1 0, i1 0>
+/// Here the number of valid elements (NumVec) plays the vital role: once the
+/// valid elements are used up, the remaining lanes get 0 as their mask value.
+Constant *InnerLoopVectorizer::getLoadStoreInstMask(IRBuilder<> &Builder,
+                                                    unsigned VF,
+                                                    unsigned Stride,
+                                                    unsigned NumVec) {
+  SmallVector<Constant *, 8> Mask;
+  for (unsigned i = 0; i < VF; i++) {
+    bool M = NumVec && ((i % Stride) ? 0 : 1);
+    Mask.push_back(Builder.getInt1(M));
+    if (M)
+      NumVec = NumVec - 1;
+  }
+  return ConstantVector::get(Mask);
+}
+
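A quick stand-alone check of the skip-factor and mask arithmetic used here; this is not code from the patch, it merely re-implements getSkipFactor and the element-selection rule of getLoadStoreInstMask for VF = 4 and stride 3.

#include <cstdio>

// Mirrors getSkipFactor() above.
static unsigned skipFactor(unsigned VF, unsigned Stride) {
  return (Stride - (VF % Stride)) % Stride;
}

int main() {
  const unsigned VF = 4, Stride = 3;
  unsigned NumVec = ((VF - 1) / Stride) + 1;  // valid elements per access: 2
  std::printf("skip factor = %u\n", skipFactor(VF, Stride));  // prints 2
  // Same element-selection rule as getLoadStoreInstMask() above:
  // lanes 0 and 3 are live, lanes 1 and 2 are masked off -> 1 0 0 1.
  for (unsigned i = 0; i < VF; ++i) {
    bool M = NumVec && ((i % Stride) ? 0 : 1);
    if (M)
      --NumVec;
    std::printf("%d ", (int)M);
  }
  std::printf("\n");
  return 0;
}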
+/// Create the shuffle vector mask for a masked store.
+/// I.e. for a stride 3 store with VF 4 it will generate the following two
+/// masks for the different parts, in 2 iterations:
+/// Part#1 Mask: <i32 4, i32 1, i32 2, i32 5>
+/// Part#2 Mask: <i32 6, i32 1, i32 2, i32 7>
Constant *InnerLoopVectorizer::getStoreShuffleVectorMask(IRBuilder<> &Builder,
+                                                         unsigned VF,
+                                                         unsigned Stride,
+                                                         unsigned Part) {
+  unsigned ValidElements =
+      VF - (getValidMemInSingleVFAccess(VF, Stride) * Part);
+  unsigned StartIndex = VF + Part * getValidMemInSingleVFAccess(VF, Stride);
+  SmallVector<Constant *, 8> Mask;
+  for (unsigned i = 0; i < VF; i++) {
+    // If it is not our element, or the valid elements are used up, keep the
+    // index i (which selects an undef lane from the first operand).
+    if (!ValidElements || (i % Stride)) {
+      Mask.push_back(Builder.getInt32(i));
+    } else {
+      // If it is our element, place StartIndex.
+      Mask.push_back(Builder.getInt32(StartIndex));
+      StartIndex = StartIndex + 1;
+      ValidElements = ValidElements - 1;
+    }
+  }
+  return ConstantVector::get(Mask);
+}
+
+/// Create the mask for the shuffle that combines the loads, based on its
+/// operands.
+/// I.e. for stride 3 and vector factor 4, with two loads as operands, it will
+/// generate mask: <i32 0, i32 3, i32 4, i32 7>
+/// Either operand can itself be a shuffle, i.e.
+/// First operand:
+/// %shuf.vec = shufflevector <4 x i32> ...
+/// Second operand:
+/// %load.vec = load <4 x i32> ...
+/// In this case the already-shuffled elements of the first operand are kept
+/// as-is and the mask continues with the strided elements of the second
+/// operand.
+/// NOTE: If any operand is a shuffle, it has to be the first operand.
+Constant *InnerLoopVectorizer::getLoadShuffleVectorMask(
+    IRBuilder<> &Builder, unsigned Start, unsigned Stride, unsigned VF,
+    bool isFirstOpShuffle, unsigned FirstOpLen, unsigned SecondOpLen) {
+  SmallVector<Constant *, 8> Mask;
+  if (isFirstOpShuffle) {
+    for (unsigned i = 0; i < FirstOpLen; i++) {
+      Mask.push_back(Builder.getInt32(Start + i));
+    }
+  } else {
+    for (unsigned i = 0; i < FirstOpLen; i++) {
+      Mask.push_back(Builder.getInt32(Start + (i * Stride)));
+    }
+  }
+  // For the second vector, move the start by VF.
+  Start = Start + VF;
+  for (unsigned i = 0; i < SecondOpLen; i++) {
+    Mask.push_back(Builder.getInt32(Start + i * Stride));
+  }
+  return ConstantVector::get(Mask);
+}
+
 // Get a mask to interleave \p NumVec vectors into a wide vector.
 // I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
 // E.g. For 2 interleaved vectors, if VF is 4, the mask is:
@@ -2421,6 +2719,13 @@
   if (Instr != Group->getInsertPos())
     return;
 
+  // Vectorize as a strided access if the instruction holds a valid strided operation.
+  if (EnableStridedVectorization && Legal->isValidStrideOperation(Instr) &&
+      (Group->getNumMembers() == 1)) {
+    vectorizeStridedMemoryAccess(Instr);
+    return;
+  }
+
   LoadInst *LI = dyn_cast<LoadInst>(Instr);
   StoreInst *SI = dyn_cast<StoreInst>(Instr);
   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
@@ -2534,6 +2839,244 @@
   }
 }
 
+/// Vectorize a strided load.
+/// It generates multiple vector loads, taking the skip factor into account.
+/// The skip factor is the offset gap between two loads.
+/// It performs multiple loads followed by shuffles to extract the elements.
+///
+/// I.e. for stride 2 with vector factor 4,
+/// <...> = arr[i*2]
+/// will generate the following instructions:
+///
+/// %1 = getelementptr inbounds [100 x i32], [100 x i32]* @a, i64 0, i64 %0
+/// %2 = bitcast i32* %1 to <4 x i32>*
+/// %stride.load = load <4 x i32>, <4 x i32>* %2
+///
+/// %3 = getelementptr i32, i32* %1, i64 4
+/// %4 = bitcast i32* %3 to <4 x i32>*
+/// %stride.load18 = load <4 x i32>, <4 x i32>* %4, align 16, !tbaa !1
+/// %strided.vec = shufflevector <4 x i32> %stride.load, <4 x i32> %stride.load18,
+///                              <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+///
+/// For stride 2 the skip factor will be 0.
+///
+/// For memory safety, if the user wants to generate vector masked loads, this
+/// is supported under the "-enable-strided-mask-load" command line option.
+void InnerLoopVectorizer::vectorizeStridedLoad(Instruction *Instr) {
+  DEBUG(dbgs() << "LV: Vectorizing strided load: " << *Instr << "\n");
+  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
+  unsigned InterleaveFactor = Group->getFactor();
+  LoadInst *LI = dyn_cast<LoadInst>(Instr);
+  Value *Ptr = LI->getPointerOperand();
+  Type *ScalarTy = LI->getType();
+  Type *VecTy = VectorType::get(ScalarTy, VF);
+  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+  unsigned SkipFactor = getSkipFactor(VF, InterleaveFactor);
+  unsigned NumLoadRequired = getRequiredLoadStore(VF, InterleaveFactor);
+  unsigned NumMemAccessInVF = getValidMemInSingleVFAccess(VF, InterleaveFactor);
+
+  setDebugLocFromInst(Builder, Ptr);
+  VectorParts &PtrParts = getVectorValue(Ptr);
+
+  // Consider unrolling.
+  // Widen instructions based on the unroll factor.
+  // PtrParts[URIndex] holds the memory address for a particular unroll index.
+  // For each unroll index, starting from its start address, widen the memory
+  // instructions by generating multiple unrolled versions.
+  for (unsigned URIndex = 0; URIndex < UF; URIndex++) {
+    // Identify the start address.
+    Value *StartAddress = Builder.CreateExtractElement(
+        PtrParts[URIndex],
+        Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
+
+    Instruction *PrevInstr = nullptr;
+    unsigned Index = 0;
+    // Generate the required loads, based on Index.
+    for (unsigned Part = 0; Part < NumLoadRequired; Part++) {
+      // Create a GEP based on Index.
+      // If the stride is 2 and VF is 4, the starting offset will come from:
+      //   %StartIndex = shl i64 %index, 1
+      //
+      // In case of a reverse loop:
+      //   %.lhs = shl i64 %offset.idx, 1
+      //   %StartIndex = add i64 %.lhs, -6
+      Value *NewPtr = Builder.CreateGEP(StartAddress, Builder.getInt32(Index));
+      Instruction *NewLoadInstr = nullptr;
+      bool LastIteration = (Part + 1 >= NumLoadRequired);
+      // Make the last load a masked load.
+      if (EnableStridedMaskLoad && LastIteration) {
+        unsigned ElementsToLoad = VF - (NumMemAccessInVF * Part);
+        Constant *LoadMask =
+            getLoadStoreInstMask(Builder, VF, InterleaveFactor, ElementsToLoad);
+        NewLoadInstr = Builder.CreateMaskedLoad(
+            Builder.CreateBitCast(NewPtr, PtrTy), Group->getAlignment(),
+            LoadMask, UndefValue::get(VecTy), "wide.masked.load");
+      } else {
+        // Alignments smaller than the target default indicate that the code
+        // generator for the target needs to generate the unaligned load/store.
+        // When SkipFactor is 0, generate an aligned access, else generate an
+        // unaligned memory access.
+        // NOTE: An alignment value of 1 indicates an unaligned access.
+        NewLoadInstr = Builder.CreateAlignedLoad(
+            Builder.CreateBitCast(NewPtr, PtrTy),
+            (SkipFactor ? 1 : Group->getAlignment()), "stride.load");
+      }
+      propagateMetadata(NewLoadInstr, Instr);
+
+      if (PrevInstr == nullptr) {
+        PrevInstr = NewLoadInstr;
+      } else {
+        // Check whether the previous instruction is a shuffle.
+        bool isShuffle = dyn_cast<ShuffleVectorInst>(PrevInstr) ? true : false;
+        // Identify the number of elements to shuffle from the first load.
+        unsigned FirstOpLen = isShuffle
+                                  ? dyn_cast<ShuffleVectorInst>(PrevInstr)
+                                        ->getShuffleMask()
+                                        .size()
+                                  : NumMemAccessInVF;
+        // Identify the number of elements to shuffle from the second load.
+        unsigned SecondOpLen =
+            LastIteration ? (VF - (NumMemAccessInVF * Part)) : NumMemAccessInVF;
+        // Create the input mask for the shuffle.
+        Constant *StrideMask = getLoadShuffleVectorMask(
+            Builder, 0, InterleaveFactor, VF, isShuffle, FirstOpLen,
+            SecondOpLen);
+        if (isShuffle) {
+          // Make the types equal for the shuffle.
+          while (PrevInstr->getType() != NewLoadInstr->getType()) {
+            unsigned FirstOpLen =
+                dyn_cast<ShuffleVectorInst>(PrevInstr)->getShuffleMask().size();
+            unsigned SecondOpLen = VF - FirstOpLen;
+            Constant *MakeUpMask = getLoadShuffleVectorMask(
+                Builder, 0, 1,
+                dyn_cast<ShuffleVectorInst>(PrevInstr)->getShuffleMask().size(),
+                isShuffle, FirstOpLen, SecondOpLen);
+            Value *MakeUpShuffle = Builder.CreateShuffleVector(
+                PrevInstr, UndefValue::get(PrevInstr->getType()), MakeUpMask,
+                "strided.vec");
+            PrevInstr = dyn_cast<Instruction>(MakeUpShuffle);
+          }
+        }
+        // Create the shuffle vector.
+        Value *StridedVec = Builder.CreateShuffleVector(
+            PrevInstr, NewLoadInstr, StrideMask, "strided.vec");
+        // If this member has a different type, cast the result type.
+        if (Instr->getType() != ScalarTy) {
+          VectorType *OtherVTy = VectorType::get(Instr->getType(), VF);
+          StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
+        }
+        // Update the instruction in the WidenMap.
+        VectorParts &Entry = WidenMap.get(Instr);
+        Entry[URIndex] =
+            Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
+        PrevInstr = dyn_cast<Instruction>(StridedVec);
+      }
+      // Move the Index.
+      Index = (NumMemAccessInVF * (Part + 1)) * InterleaveFactor;
+    }
+  }
+  return;
+}
+
+/// Vectorize a strided store.
+/// It generates multiple vector masked stores, taking the skip factor into
+/// account. It generates a shuffle followed by a masked store to insert the
+/// elements. The skip factor is the offset gap between two stores.
+/// I.e. for stride 3 with vector factor 4,
+/// arr[3*i] = <...>
+/// will generate the following instructions:
+///
+/// %8 = getelementptr inbounds i32, i32* %arr, i64 %0
+/// %9 = bitcast i32* %8 to <4 x i32>*
+/// %interleaved.vec = shufflevector <4 x i32> %7, <4 x i32> undef, <4 x i32>
+/// call void @llvm.masked.store.v4i32(<4 x i32> %interleaved.vec, <4 x i32>* %9,
+///                                    i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>)
+///
+/// %10 = getelementptr i32, i32* %8, i64 6
+/// %11 = bitcast i32* %10 to <4 x i32>*
+/// %interleaved.vec19 = shufflevector <4 x i32> %7, <4 x i32> undef, <4 x i32>
+/// call void @llvm.masked.store.v4i32(<4 x i32> %interleaved.vec19, <4 x i32>* %11,
+///                                    i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>)
+///
+/// For stride 3 the skip factor will be 2.
+void InnerLoopVectorizer::vectorizeStridedStore(Instruction *Instr) {
+  DEBUG(dbgs() << "LV: Vectorizing strided store: " << *Instr << "\n");
+  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
+  unsigned InterleaveFactor = Group->getFactor();
+  StoreInst *SI = dyn_cast<StoreInst>(Instr);
+  Value *Ptr = SI->getPointerOperand();
+  // Prepare the vector type of the strided store.
+  Type *ScalarTy = SI->getValueOperand()->getType();
+  Type *VecTy = VectorType::get(ScalarTy, VF);
+  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+  // Check the required number of stores & the valid elements in each memory
+  // access.
+  unsigned NumStoreRequired = getRequiredLoadStore(VF, InterleaveFactor);
+  unsigned NumMemAccessInVF = getValidMemInSingleVFAccess(VF, InterleaveFactor);
+
+  setDebugLocFromInst(Builder, Ptr);
+  VectorParts &PtrParts = getVectorValue(Ptr);
+  VectorParts StoredVal = getVectorValue(SI->getValueOperand());
+  Value *UndefVec = UndefValue::get(VecTy);
+  // Consider unrolling.
+  // Widen instructions based on the unroll factor.
+  // PtrParts[URIndex] holds the memory address for a particular unroll index.
+  // For each unroll index, starting from its start address, widen the memory
+  // instructions by generating multiple unrolled versions.
+  for (unsigned URIndex = 0; URIndex < UF; URIndex++) {
+    unsigned Index = 0;
+    // Identify the starting address.
+    Value *StartAddress = Builder.CreateExtractElement(
+        PtrParts[URIndex],
+        Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
+    // Generate the required stores.
+    for (unsigned Part = 0; Part < NumStoreRequired; Part++) {
+      // Create a GEP based on Index.
+      // If the stride is 2 and VF is 4, the starting offset will come from:
+      //   %StartIndex = shl i64 %index, 1
+      //
+      // In case of a reverse loop:
+      //   %.lhs = shl i64 %offset.idx, 1
+      //   %StartIndex = add i64 %.lhs, -6
+      Value *NewPtr = Builder.CreateGEP(StartAddress, Builder.getInt32(Index));
+      NewPtr = Builder.CreateBitCast(NewPtr, PtrTy);
+      // Interleave the elements into the wide vector.
+      Constant *ShuffleMask =
+          getStoreShuffleVectorMask(Builder, VF, InterleaveFactor, Part);
+      // Create the shuffle vector; if the loop is reversed then use the
+      // reversed vector.
+      Value *ShuffleVector = Builder.CreateShuffleVector(
+          UndefVec, (Group->isReverse() ? reverseVector(StoredVal[URIndex])
                                         : StoredVal[URIndex]),
+          ShuffleMask, "interleaved.vec");
+      // Get the masked store instruction's mask.
+      Value *StoreMask =
+          getLoadStoreInstMask(Builder, VF, InterleaveFactor, NumMemAccessInVF);
+      // Generate the masked store instruction.
+      Instruction *NewStoreInstr = Builder.CreateMaskedStore(
+          ShuffleVector, NewPtr, Group->getAlignment(), StoreMask);
+      propagateMetadata(NewStoreInstr, Instr);
+      // Move the Index.
+      Index = (NumMemAccessInVF * (Part + 1)) * InterleaveFactor;
+    }
+  }
+}
+
+/// Try to vectorize strided load and store memory accesses.
+/// For loads it generates multiple load operations followed by shuffles to
+/// model the element extraction. For stores it generates multiple shuffles
+/// followed by masked store instructions.
+void InnerLoopVectorizer::vectorizeStridedMemoryAccess(Instruction *Instr) {
+  LoadInst *LI = dyn_cast<LoadInst>(Instr);
+  StoreInst *SI = dyn_cast<StoreInst>(Instr);
+  if (LI) {
+    vectorizeStridedLoad(Instr);
+  } else if (SI) {
+    vectorizeStridedStore(Instr);
+  } else {
+    DEBUG(dbgs() << "LV: Unsupported instruction for strided vectorization"
+                 << "\n");
+  }
+  return;
+}
+
 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   // Attempt to issue a wide load.
   LoadInst *LI = dyn_cast<LoadInst>(Instr);
@@ -4596,7 +5139,7 @@
 
   // Analyze interleaved memory accesses.
   if (UseInterleaved)
-    InterleaveInfo.analyzeInterleaving(Strides);
+    InterleaveInfo.analyzeInterleaving(Strides, LInfo);
 
   unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
   if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
@@ -5044,7 +5587,7 @@
 
 void InterleavedAccessInfo::collectConstStridedAccesses(
     MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
-    const ValueToValueMap &Strides) {
+    const ValueToValueMap &Strides, LoopInfo *LInfo) {
   // Holds load/store instructions in program order.
   SmallVector<Instruction *, 16> AccessList;
 
@@ -5090,6 +5633,12 @@
       Align = DL.getABITypeAlignment(PtrTy->getElementType());
 
     StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
+    // Update the StrideInfo for this instruction.
+    if (EnableStridedVectorization) {
+      StrideInfo.insert(std::pair<Instruction *, StrideAccessInfo>(
+          I,
+          StrideAccessInfo(I, StrideAccesses[I], PSE.getSE(), TheLoop, LInfo)));
+    }
   }
 }
 
@@ -5114,12 +5663,12 @@
 // The store group of (2) is always inserted at or below (2), and the load group
 // of (1) is always inserted at or above (1). The dependence is safe.
 void InterleavedAccessInfo::analyzeInterleaving(
-    const ValueToValueMap &Strides) {
+    const ValueToValueMap &Strides, LoopInfo *LInfo) {
   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
 
   // Holds all the stride accesses.
   MapVector<Instruction *, StrideDescriptor> StrideAccesses;
-  collectConstStridedAccesses(StrideAccesses, Strides);
+  collectConstStridedAccesses(StrideAccesses, Strides, LInfo);
 
   if (StrideAccesses.empty())
     return;
@@ -5202,8 +5751,14 @@
 
   // Remove interleaved store groups with gaps.
   for (InterleaveGroup *Group : StoreGroups)
-    if (Group->getNumMembers() != Group->getFactor())
-      releaseGroup(Group);
+    if (Group->getNumMembers() != Group->getFactor()) {
+      Instruction *I = Group->getMember(0);
+      // Don't release the group if it has only one member and that member is
+      // a valid strided operation; otherwise release it.
+      if (!(EnableStridedVectorization && Group->getNumMembers() == 1 &&
+            isValidStrideOperation(I)))
+        releaseGroup(Group);
+    }
 
   // If there is a non-reversed interleaved load group with gaps, we will need
   // to execute at least one scalar epilogue iteration. This will ensure that
@@ -6036,12 +6591,19 @@
         if (Group->getMember(i))
           Indices.push_back(i);
       }
-
-      // Calculate the cost of the whole interleaved group.
-      unsigned Cost = TTI.getInterleavedMemoryOpCost(
-          I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
-          Group->getAlignment(), AS);
-
+      unsigned Cost = 0;
+      // If this is a valid strided access, get the cost of the memory operations
+      // involved in it; otherwise get the cost of the interleaved memory group.
+      if (EnableStridedVectorization && Legal->isValidStrideOperation(I) &&
+          (Group->getNumMembers() == 1)) {
+        Cost = TTI.getStridedMemoryOpCost(I->getOpcode(), WideVecTy,
+                                          Group->getFactor(), Indices,
+                                          Group->getAlignment(), AS);
+      } else {
+        Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
+                                              Group->getFactor(), Indices,
+                                              Group->getAlignment(), AS);
+      }
       if (Group->isReverse())
         Cost += Group->getNumMembers() *