Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -477,6 +477,9 @@
   /// This is currently measured in number of instructions.
   unsigned getPrefetchDistance() const;
 
+  /// \return The number of cache lines ahead we should prefetch.
+  unsigned getPrefetchDegree() const;
+
   /// \return Some HW prefetchers can handle accesses up to a certain constant
   /// stride. This is the minimum stride in bytes where it makes sense to start
   /// adding SW prefetches. The default is 1, i.e. prefetch with any stride.
@@ -730,6 +733,7 @@
   virtual unsigned getRegisterBitWidth(bool Vector) = 0;
   virtual unsigned getCacheLineSize() = 0;
   virtual unsigned getPrefetchDistance() = 0;
+  virtual unsigned getPrefetchDegree() = 0;
   virtual unsigned getMinPrefetchStride() = 0;
   virtual unsigned getMaxPrefetchIterationsAhead() = 0;
   virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
@@ -944,6 +948,7 @@
     return Impl.getCacheLineSize();
   }
   unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); }
+  unsigned getPrefetchDegree() override { return Impl.getPrefetchDegree(); }
   unsigned getMinPrefetchStride() override {
     return Impl.getMinPrefetchStride();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -295,6 +295,8 @@
   unsigned getPrefetchDistance() { return 0; }
 
+  unsigned getPrefetchDegree() { return 0; }
+
   unsigned getMinPrefetchStride() { return 1; }
 
   unsigned getMaxPrefetchIterationsAhead() { return UINT_MAX; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -262,6 +262,10 @@
   return TTIImpl->getPrefetchDistance();
 }
 
+unsigned TargetTransformInfo::getPrefetchDegree() const {
+  return TTIImpl->getPrefetchDegree();
+}
+
 unsigned TargetTransformInfo::getMinPrefetchStride() const {
   return TTIImpl->getMinPrefetchStride();
 }
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -87,6 +87,7 @@
   uint8_t VectorInsertExtractBaseCost = 3;
   uint16_t CacheLineSize = 0;
   uint16_t PrefetchDistance = 0;
+  uint16_t PrefetchDegree = 0;
   uint16_t MinPrefetchStride = 1;
   unsigned MaxPrefetchIterationsAhead = UINT_MAX;
   unsigned PrefFunctionAlignment = 0;
@@ -197,6 +198,7 @@
   }
   unsigned getCacheLineSize() const { return CacheLineSize; }
   unsigned getPrefetchDistance() const { return PrefetchDistance; }
+  unsigned getPrefetchDegree() const { return PrefetchDegree; }
   unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
   unsigned getMaxPrefetchIterationsAhead() const {
     return MaxPrefetchIterationsAhead;
Index: lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.cpp
+++ lib/Target/AArch64/AArch64Subtarget.cpp
@@ -73,6 +73,7 @@
     VectorInsertExtractBaseCost = 2;
     CacheLineSize = 128;
     PrefetchDistance = 740;
+    PrefetchDegree = 1;
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 11;
     break;
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -135,6 +135,8 @@
   unsigned getPrefetchDistance();
 
+  unsigned getPrefetchDegree();
+
   unsigned getMinPrefetchStride();
 
   unsigned getMaxPrefetchIterationsAhead();
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -633,6 +633,10 @@
   return ST->getPrefetchDistance();
 }
 
+unsigned AArch64TTIImpl::getPrefetchDegree() {
+  return ST->getPrefetchDegree();
+}
+
 unsigned AArch64TTIImpl::getMinPrefetchStride() {
   return ST->getMinPrefetchStride();
 }
Index: lib/Transforms/Scalar/LoopDataPrefetch.cpp
===================================================================
--- lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -52,6 +52,11 @@
                      cl::Hidden);
 
 static cl::opt<unsigned>
+    PrefetchDegree("prefetch-degree",
+                   cl::desc("Number of cache lines to prefetch ahead"),
+                   cl::Hidden);
+
+static cl::opt<unsigned>
     MinPrefetchStride("min-prefetch-stride",
                       cl::desc("Min stride to add prefetches"), cl::Hidden);
@@ -80,6 +85,9 @@
     /// warrant a prefetch.
    bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
 
+    /// \brief Look for irregular symbolic strides.
+    bool isIrregularSymbolicStride(const SCEVAddRecExpr *AR, Loop *L);
+
    unsigned getMinPrefetchStride() {
      if (MinPrefetchStride.getNumOccurrences() > 0)
        return MinPrefetchStride;
@@ -92,6 +100,12 @@
      return TTI->getPrefetchDistance();
    }
 
+    unsigned getPrefetchDegree() {
+      if (PrefetchDegree.getNumOccurrences() > 0)
+        return PrefetchDegree;
+      return TTI->getPrefetchDegree();
+    }
+
    unsigned getMaxPrefetchIterationsAhead() {
      if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
        return MaxPrefetchIterationsAhead;
@@ -161,6 +175,53 @@
   return TargetMinStride <= AbsStride;
 }
 
+bool LoopDataPrefetch::isIrregularSymbolicStride(const SCEVAddRecExpr *AR,
+                                                 Loop *L) {
+  // If there is no outer loop we cannot ensure the stride is irregular.
+  const Loop *OuterLoop = L->getParentLoop();
+  if (!OuterLoop)
+    return false;
+
+  // If the base address is varying, then it is an irregular access.
+  const SCEV *Base = AR->getStart();
+  if (SE->isLoopInvariant(Base, OuterLoop))
+    return false;
+
+  // Now check if it is a symbolic stride.
+  const SCEV *V = AR->getStepRecurrence(*SE);
+  if (!V)
+    return false;
+
+  const auto *M = dyn_cast<SCEVMulExpr>(V);
+  if (!M)
+    return false;
+
+  // Check if the step value is a non-unit constant.
+  // Be conservative and give up if the step value is larger than the cache
+  // line.
+  V = M->getOperand(0);
+  if (V->getSCEVType() != scConstant)
+    return false;
+
+  if (V->isOne() || V->isAllOnesValue())
+    return false;
+
+  const APInt &APStepVal = cast<SCEVConstant>(V)->getAPInt();
+  int64_t StepVal = APStepVal.getSExtValue();
+  if (StepVal > TTI->getCacheLineSize())
+    return false;
+
+  V = M->getOperand(1);
+
+  // Strip off casts.
+  while (const auto *C = dyn_cast<SCEVCastExpr>(V))
+    V = C->getOperand();
+
+  const auto *U = dyn_cast<SCEVUnknown>(V);
+  if (!U)
+    return false;
+
+  return L->isLoopInvariant(U->getValue());
+}
+
 PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
@@ -245,21 +306,16 @@
     LoopSize = 1;
 
   unsigned ItersAhead = getPrefetchDistance() / LoopSize;
+  unsigned BytesAhead = TTI->getCacheLineSize() * getPrefetchDegree();
   if (!ItersAhead)
     ItersAhead = 1;
 
-  if (ItersAhead > getMaxPrefetchIterationsAhead())
-    return MadeChange;
-
-  DEBUG(dbgs() << "Prefetching " << ItersAhead
-               << " iterations ahead (loop size: " << LoopSize << ") in "
-               << L->getHeader()->getParent()->getName() << ": " << *L);
-
   SmallVector<std::pair<Instruction *, Instruction *>, 16> PrefLoads;
   for (const auto BB : L->blocks()) {
     for (auto &I : *BB) {
       Value *PtrValue;
       Instruction *MemI;
+      bool PrefetchNextLine = false;
 
       if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
         MemI = LMemI;
@@ -270,7 +326,11 @@
         PtrValue = SMemI->getPointerOperand();
       } else continue;
 
-      unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+      auto *PtrTy = dyn_cast<PointerType>(PtrValue->getType());
+      if (!PtrTy || PtrTy->isAggregateType())
+        continue;
+
+      unsigned PtrAddrSpace = PtrTy->getPointerAddressSpace();
       if (PtrAddrSpace)
         continue;
@@ -278,13 +338,31 @@
         continue;
 
       const SCEV *LSCEV = SE->getSCEV(PtrValue);
-      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+      const auto *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
       if (!LSCEVAddRec)
         continue;
 
+      if (getPrefetchDegree() > 0)
+        PrefetchNextLine = isIrregularSymbolicStride(LSCEVAddRec, L);
+
+      if (!PrefetchNextLine && ItersAhead > getMaxPrefetchIterationsAhead())
+        continue;
+
+      if (PrefetchNextLine) {
+        DEBUG(dbgs() << "Prefetching " << getPrefetchDegree()
+                     << " cache line(s) ahead (irregular access: " << *PtrValue
+                     << ") in " << L->getHeader()->getParent()->getName()
+                     << ": " << *L);
+      } else {
+        DEBUG(dbgs() << "Prefetching " << ItersAhead
+                     << " iterations ahead (loop size: " << LoopSize << ") in "
+                     << L->getHeader()->getParent()->getName() << ": " << *L);
+      }
+
       // Check if the the stride of the accesses is large enough to warrant a
       // prefetch.
-      if (!isStrideLargeEnough(LSCEVAddRec))
+      if (!PrefetchNextLine && !isStrideLargeEnough(LSCEVAddRec))
         continue;
 
       // We don't want to double prefetch individual cache lines. If this load
@@ -305,9 +383,17 @@
       if (DupPref)
         continue;
 
-      const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
-          SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
-          LSCEVAddRec->getStepRecurrence(*SE)));
+      // If this is an irregular symbolic stride, prefetch the next cache line.
+      const SCEV *NextLSCEV =
+          PrefetchNextLine
+              ? SE->getAddExpr(LSCEVAddRec,
+                               SE->getConstant(LSCEVAddRec->getType(),
+                                               BytesAhead))
+              : SE->getAddExpr(
+                    LSCEVAddRec,
+                    SE->getMulExpr(
+                        SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
+                        LSCEVAddRec->getStepRecurrence(*SE)));
 
       if (!isSafeToExpand(NextLSCEV, *SE))
         continue;
Index: test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll
===================================================================
--- test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll
+++ test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll
@@ -1,7 +1,11 @@
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -max-prefetch-iters-ahead=1000 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=SYMBOLIC_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -prefetch-degree=0 -S < %s | FileCheck %s --check-prefix=NO_SYMBOLIC_PREFETCH --check-prefix=ALL
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -max-prefetch-iters-ahead=1000 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -S < %s | FileCheck %s --check-prefix=SYMBOLIC_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -prefetch-degree=0 -S < %s | FileCheck %s --check-prefix=NO_SYMBOLIC_PREFETCH --check-prefix=ALL
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
 
@@ -51,3 +55,109 @@
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+; No prefetching in the presence of regular symbolic strides:
+;
+;   for (unsigned i = 0; i < 100; i++)
+;     A[i + 1] = A[Stride * i] + B[i];
+
+; ALL-LABEL: @regular_symbolic_stride(
+define void @regular_symbolic_stride(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N,
+                                     i64 %stride) {
+entry:
+  br label %for.body
+
+; ALL: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %mul = mul i64 %indvars.iv, %stride
+; SYMBOLIC_PREFETCH-NOT: call void @llvm.prefetch
+; NO_SYMBOLIC_PREFETCH-NOT: call void @llvm.prefetch
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %mul
+  %load = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %load_1 = load i32, i32* %arrayidx2, align 4
+  %add = add i32 %load_1, %load
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  store i32 %add, i32* %arrayidx_next, align 4
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Prefetching in the presence of irregular symbolic strides:
+;
+;   struct MyStruct {
+;     int field;
+;     char kk[60];
+;   } *my_struct;
+;
+;   int f(struct MyStruct *p, struct MyStruct *q, int N) {
+;     int total = 0;
+;     struct MyStruct *r = p;
+;     for (int i = 0; i < N/300; ++i)
+;       for (r = p + i; r < q; r += N)
+;         total += r->field;
+;     return total;
+;   }
+
+%struct.MyStruct = type { i32, [60 x i8] }
+
+@my_struct = local_unnamed_addr global %struct.MyStruct* null, align 8
+
+; ALL-LABEL: @irregular_symbolic_stride(
+define i32 @irregular_symbolic_stride(%struct.MyStruct* readonly %p, %struct.MyStruct* readnone %q, i32 %N) local_unnamed_addr {
+entry:
+  %cmp23 = icmp sgt i32 %N, 299
+  br i1 %cmp23, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %div27 = udiv i32 %N, 300
+  %idx.ext4 = sext i32 %N to i64
+  %0 = zext i32 %div27 to i64
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.lr.ph, %for.inc6
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next, %for.inc6 ]
+  %total.024 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %total.1.lcssa, %for.inc6 ]
+  %add.ptr519 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %p, i64 %indvars.iv
+  %cmp220 = icmp ult %struct.MyStruct* %add.ptr519, %q
+  br i1 %cmp220, label %for.body3.preheader, label %for.inc6
+
+for.body3.preheader:                              ; preds = %for.cond1.preheader
+  br label %for.body3
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc6
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %total.0.lcssa = phi i32 [ 0, %entry ], [ %total.1.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %total.0.lcssa
+
+; ALL: for.body3:
+for.body3:                                        ; preds = %for.body3.preheader, %for.body3
+  %add.ptr522 = phi %struct.MyStruct* [ %add.ptr5, %for.body3 ], [ %add.ptr519, %for.body3.preheader ]
+  %total.121 = phi i32 [ %add, %for.body3 ], [ %total.024, %for.body3.preheader ]
+  %field = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %add.ptr522, i64 0, i32 0
+; SYMBOLIC_PREFETCH: call void @llvm.prefetch
+; NO_SYMBOLIC_PREFETCH-NOT: call void @llvm.prefetch
+  %1 = load i32, i32* %field, align 4
+  %add = add nsw i32 %1, %total.121
+  %add.ptr5 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %add.ptr522, i64 %idx.ext4
+  %cmp2 = icmp ult %struct.MyStruct* %add.ptr5, %q
+  br i1 %cmp2, label %for.body3, label %for.inc6.loopexit
+
+; ALL: for.inc6.loopexit:
+for.inc6.loopexit:                                ; preds = %for.body3
+  br label %for.inc6
+
+for.inc6:                                         ; preds = %for.inc6.loopexit, %for.cond1.preheader
+  %total.1.lcssa = phi i32 [ %total.024, %for.cond1.preheader ], [ %add, %for.inc6.loopexit ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp slt i64 %indvars.iv.next, %0
+  br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup.loopexit
+}
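
Note (not part of the patch): a minimal standalone sketch of what the new tuning knob means in practice. It mirrors the "BytesAhead = TTI->getCacheLineSize() * getPrefetchDegree()" computation added to LoopDataPrefetch.cpp, plugging in the Kryo values set in AArch64Subtarget.cpp above (CacheLineSize = 128, PrefetchDegree = 1). The CurrentAddr value is a made-up example address used only for illustration.

// Illustration only: with the Kryo tuning above, an access classified as an
// irregular symbolic stride gets a prefetch of the address
// CacheLineSize * PrefetchDegree bytes past the current access (one 128-byte
// cache line ahead), instead of ItersAhead loop iterations ahead.
#include <cstdint>
#include <cstdio>

int main() {
  // Kryo tuning values from AArch64Subtarget.cpp in this patch.
  const unsigned CacheLineSize = 128;
  const unsigned PrefetchDegree = 1;
  // Mirrors the BytesAhead computation in LoopDataPrefetch.cpp.
  const unsigned BytesAhead = CacheLineSize * PrefetchDegree;

  // Hypothetical address of the current irregular strided access.
  const uint64_t CurrentAddr = 0x1000;
  // For PrefetchNextLine accesses the pass expands AddRec + BytesAhead, so the
  // emitted prefetch targets the next cache line after the access.
  std::printf("prefetch 0x%llx (%u bytes ahead)\n",
              (unsigned long long)(CurrentAddr + BytesAhead), BytesAhead);
  return 0;
}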