Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -7130,6 +7130,33 @@
 not found to interact with the environment in an observable way, the loop may
 be removed. This corresponds to the ``mustprogress`` function attribute.
 
+'``llvm.loop.prefetch``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``llvm.loop.prefetch`` metadata can be attached to ``load`` and ``store``
+instructions. It indicates whether the data accessed by the instruction should
+be prefetched and, if so, which cache level to prefetch the data into and how
+many loop iterations ahead of the access the prefetch is issued.
+
+.. code-block:: llvm
+
+   %val = load i32, ptr %arrayidx, !llvm.loop.prefetch !0
+   %val0 = load i32, ptr %arrayidx1, !llvm.loop.prefetch !1
+   ...
+   !0 = distinct !{i1 false, i32 -1, i32 -1}
+   !1 = distinct !{i1 true, i32 1, i32 128}
+
+If the first value of this metadata is false, the data accessed by the
+instruction will not be prefetched. In this case, the remaining two values are
+ignored.
+
+If the first value of this metadata is true, the data accessed by the
+instruction should be prefetched. The second value indicates which cache level
+the prefetched data is placed in (-1: unspecified, 0: no cache, 1: L1 cache,
+2: L2 cache, 3: L3 cache). The third value is the number of loop iterations
+ahead of the corresponding ``load``/``store`` instruction at which the prefetch
+is issued (-1: unspecified).
+
 '``irr_loop``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^
 
Index: llvm/include/llvm/IR/FixedMetadataKinds.def
===================================================================
--- llvm/include/llvm/IR/FixedMetadataKinds.def
+++ llvm/include/llvm/IR/FixedMetadataKinds.def
@@ -50,3 +50,4 @@
 LLVM_FIXED_MD_KIND(MD_kcfi_type, "kcfi_type", 36)
 LLVM_FIXED_MD_KIND(MD_pcsections, "pcsections", 37)
 LLVM_FIXED_MD_KIND(MD_DIAssignID, "DIAssignID", 38)
+LLVM_FIXED_MD_KIND(MD_loop_prefetch, "llvm.loop.prefetch", 39)
Index: llvm/lib/Analysis/VectorUtils.cpp
===================================================================
--- llvm/lib/Analysis/VectorUtils.cpp
+++ llvm/lib/Analysis/VectorUtils.cpp
@@ -896,7 +896,7 @@
   for (auto Kind : {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
                     LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
                     LLVMContext::MD_nontemporal, LLVMContext::MD_invariant_load,
-                    LLVMContext::MD_access_group}) {
+                    LLVMContext::MD_access_group, LLVMContext::MD_loop_prefetch}) {
     MDNode *MD = I0->getMetadata(Kind);
 
     for (int J = 1, E = VL.size(); MD && J != E; ++J) {
@@ -920,6 +920,9 @@
       case LLVMContext::MD_access_group:
         MD = intersectAccessGroups(Inst, IJ);
         break;
+      case LLVMContext::MD_loop_prefetch:
+        // Nothing to merge: MD keeps the node that was read from I0.
+        break;
       default:
         llvm_unreachable("unhandled metadata");
       }
Index: llvm/lib/Transforms/Scalar/LICM.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LICM.cpp
+++ llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1716,6 +1716,12 @@
                                                          << ore::NV("Inst", &I);
   });
 
+  // The llvm.loop.prefetch metadata is associated with the loop where the
+  // instruction is located. Once it is hoisted, the metadata will become
+  // invalid. Remove it.
+  if (I.hasMetadata(LLVMContext::MD_loop_prefetch))
+    I.setMetadata(LLVMContext::MD_loop_prefetch, nullptr);
+
   // Metadata can be dependent on conditions we are hoisting above.
   // Conservatively strip all metadata on the instruction unless we were
   // guaranteed to execute I if we entered the loop, in which case the metadata
Index: llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -240,9 +240,16 @@
   bool Writes = false;
   /// The (first seen) prefetched instruction.
   Instruction *MemI = nullptr;
-
-  /// Constructor to create a new Prefetch for \p I.
-  Prefetch(const SCEVAddRecExpr *L, Instruction *I) : LSCEVAddRec(L) {
+  /// Number of loop iterations ahead at which this prefetch is issued.
+  unsigned ItersAhead;
+  /// Locality operand emitted on the llvm.prefetch call for this prefetch.
+  unsigned Locality;
+
+  /// Constructor to create a new Prefetch for \p I that is issued
+  /// \p ItersAhead iterations ahead with locality hint \p Locality.
+  Prefetch(const SCEVAddRecExpr *L, Instruction *I, unsigned ItersAhead,
+           unsigned Locality = 3)
+      : LSCEVAddRec(L), ItersAhead(ItersAhead), Locality(Locality) {
     addInstruction(I);
   };
 
@@ -337,6 +344,48 @@
         PtrValue = SMemI->getPointerOperand();
       } else continue;
 
+      // Locality operand of llvm.prefetch: '0' = no locality, '1' = L3 cache,
+      // '2' = L2 cache, '3' = L1 cache. The metadata encodes the cache level
+      // itself (-1: unspecified, 0: no cache, 1..3: L1..L3), so levels 1..3
+      // are mirrored with LocalityMax - Level.
+      const unsigned DefaultLocality = 3;
+      const int LocalityMax = 4;
+      unsigned Locality = DefaultLocality;
+      unsigned ItersAheadCustomized = ItersAhead;
+      if (auto *MD = I.getMetadata(LLVMContext::MD_loop_prefetch)) {
+        // Handle #pragma clang loop noprefetch()
+        bool PrefetchDisabled =
+            !mdconst::extract<ConstantInt>(MD->getOperand(0))->getSExtValue();
+        if (PrefetchDisabled)
+          continue;
+
+        // Handle #pragma clang loop prefetch()
+        int LocalityPrag =
+            mdconst::extract<ConstantInt>(MD->getOperand(1))->getSExtValue();
+        int ItersAheadPrag =
+            mdconst::extract<ConstantInt>(MD->getOperand(2))->getSExtValue();
+
+        // Levels outside [-1, 3] keep the default locality rather than wrapping
+        // into a value that is not a valid llvm.prefetch locality operand.
+        if (LocalityPrag == 0)
+          Locality = 0;
+        else if (LocalityPrag >= 1 && LocalityPrag < LocalityMax)
+          Locality = LocalityMax - LocalityPrag;
+
+        // -1 means "unspecified": keep the computed distance. Other negative
+        // distances are invalid, so do not prefetch for them.
+        if (ItersAheadPrag < -1)
+          continue;
+        if (ItersAheadPrag != -1)
+          ItersAheadCustomized = ItersAheadPrag;
+        // If ItersAhead == 0, there is no need for prefetching.
+        if (!ItersAheadCustomized ||
+            ItersAheadCustomized > getMaxPrefetchIterationsAhead() ||
+            (ConstantMaxTripCount &&
+             ConstantMaxTripCount < ItersAheadCustomized + 1))
+          continue;
+      }
+
       unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
       if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace))
         continue;
@@ -367,16 +416,19 @@
         }
       }
       if (!DupPref)
-        Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
+        Prefetches.push_back(
+            Prefetch(LSCEVAddRec, MemI, ItersAheadCustomized, Locality));
     }
 
   unsigned TargetMinStride =
     getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                          Prefetches.size(), HasCall);
 
-  LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
-             << " iterations ahead (loop size: " << LoopSize << ") in "
-             << L->getHeader()->getParent()->getName() << ": " << *L);
+  // Loop inside LLVM_DEBUG so NDEBUG builds do not keep an empty loop.
+  LLVM_DEBUG(for (const auto &P : Prefetches) dbgs()
+             << "Prefetching " << P.ItersAhead
+             << " iterations ahead (loop size: " << LoopSize << ") in "
+             << L->getHeader()->getParent()->getName() << ": " << *L);
   LLVM_DEBUG(dbgs() << "Loop has: "
              << NumMemAccesses << " memory accesses, "
              << NumStridedMemAccesses << " strided memory accesses, "
@@ -392,9 +444,10 @@
 
     BasicBlock *BB = P.InsertPt->getParent();
     SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
-    const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
-      SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
-      P.LSCEVAddRec->getStepRecurrence(*SE)));
+    const SCEV *NextLSCEV = SE->getAddExpr(
+        P.LSCEVAddRec,
+        SE->getMulExpr(SE->getConstant(P.LSCEVAddRec->getType(), P.ItersAhead),
+                       P.LSCEVAddRec->getStepRecurrence(*SE)));
     if (!SCEVE.isSafeToExpand(NextLSCEV))
       continue;
 
@@ -407,11 +460,10 @@
     Type *I32 = Type::getInt32Ty(BB->getContext());
     Function *PrefetchFunc = Intrinsic::getDeclaration(
         M, Intrinsic::prefetch, PrefPtrValue->getType());
-    Builder.CreateCall(
-        PrefetchFunc,
-        {PrefPtrValue,
-         ConstantInt::get(I32, P.Writes),
-         ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+    Builder.CreateCall(PrefetchFunc,
+                       {PrefPtrValue, ConstantInt::get(I32, P.Writes),
+                        ConstantInt::get(I32, P.Locality),
+                        ConstantInt::get(I32, 1)});
     ++NumPrefetches;
     LLVM_DEBUG(dbgs() << "  Access: "
                << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
Index: llvm/test/Transforms/LICM/hoist-load-with-prefetch.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LICM/hoist-load-with-prefetch.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=licm -S | FileCheck %s
+
+define dso_local void @test(ptr %a, i32 %n, ptr noalias %b) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 4
+; CHECK-NEXT:    [[CMP5:%.*]] = icmp slt i32 0, [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP5]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.for.cond.cleanup_crit_edge:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_06]] to i64
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_06]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]]
+;
+entry:
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 4
+  %cmp5 = icmp slt i32 0, %n
+  br i1 %cmp5, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.06 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %0 = load i32, ptr %arrayidx, align 4, !llvm.loop.prefetch !0
+  %idxprom = zext i32 %i.06 to i64
+  %arrayidx1 = getelementptr inbounds i32, ptr %a, i64 %idxprom
+  %1 = load i32, ptr %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  store i32 %add, ptr %arrayidx1, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge
+}
+
+!0 = distinct !{i1 true, i32 -1, i32 -1}
Index: llvm/test/Transforms/LoopDataPrefetch/pragma-loop-prefetch-noprefetch.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopDataPrefetch/pragma-loop-prefetch-noprefetch.ll
@@ -0,0 +1,356 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-data-prefetch -prefetch-distance=1000 -cache-line-size=64 -S | FileCheck %s
+
+define dso_local void @noprefetch_test(ptr %a, i32 %n, ptr %sum) {
+; CHECK-LABEL: @noprefetch_test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !0
+; CHECK-NEXT:    [[ADD]] = fadd contract double [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[SUM]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
+;
+  entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %.pre = load double, ptr %sum, align 8
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !0
+  %add = fadd contract double %1, %0
+  store double %add, ptr %sum, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define dso_local void @prefetch_one_arg_test(ptr %a, i32 %n, ptr %sum) {
+; CHECK-LABEL: @prefetch_one_arg_test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1136
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 3, i32 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !1
+; CHECK-NEXT:    [[ADD]] = fadd contract double [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[SUM]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
+;
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %.pre = load double, ptr %sum, align 8
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !1
+  %add = fadd contract double %1, %0
+  store double %add, ptr %sum, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define dso_local void @prefetch_two_args_test(ptr %a, i32 %n, ptr %sum) {
+; CHECK-LABEL: @prefetch_two_args_test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1136
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 0, i32 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !2
+; CHECK-NEXT:    [[ADD]] = fadd contract double [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[SUM]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
+;
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %.pre = load double, ptr %sum, align 8
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !2
+  %add = fadd contract double %1, %0
+  store double %add, ptr %sum, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define dso_local void @prefetch_two_args_test1(ptr %a, i32 %n, ptr %sum) {
+; CHECK-LABEL: @prefetch_two_args_test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1136
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 3, i32 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !3
+; CHECK-NEXT:    [[ADD]] = fadd contract double [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[SUM]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
+;
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %.pre = load double, ptr %sum, align 8
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !3
+  %add = fadd contract double %1, %0
+  store double %add, ptr %sum, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define dso_local void @prefetch_two_args_test2(ptr %a, i32 %n, ptr %sum) {
+; CHECK-LABEL: @prefetch_two_args_test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1136
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 2, i32 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !4
+; CHECK-NEXT:    [[ADD]] = fadd contract double [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[SUM]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
+;
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %.pre = load double, ptr %sum, align 8
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !4
+  %add = fadd contract double %1, %0
+  store double %add, ptr %sum, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define dso_local void @prefetch_two_args_test3(ptr %a, i32 %n, ptr %sum) {
+; CHECK-LABEL: @prefetch_two_args_test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1136
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 1, i32 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !5
+; CHECK-NEXT:    [[ADD]] = fadd contract double [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[SUM]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
+;
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %.pre = load double, ptr %sum, align 8
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !5
+  %add = fadd contract double %1, %0
+  store double %add, ptr %sum, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define dso_local void @prefetch_three_args_test(ptr %a, i32 %n, ptr %sum) {
+; CHECK-LABEL: @prefetch_three_args_test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 80
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 1, i32 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !6
+; CHECK-NEXT:    [[ADD]] = fadd contract double [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[SUM]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
+;
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %.pre = load double, ptr %sum, align 8
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !6
+  %add = fadd contract double %1, %0
+  store double %add, ptr %sum, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+!0 = distinct !{i1 false, i32 -1, i32 -1}
+!1 = distinct !{i1 true, i32 -1, i32 -1}
+!2 = distinct !{i1 true, i32 0, i32 -1}
+!3 = distinct !{i1 true, i32 1, i32 -1}
+!4 = distinct !{i1 true, i32 2, i32 -1}
+!5 = distinct !{i1 true, i32 3, i32 -1}
+!6 = distinct !{i1 true, i32 3, i32 10}
Index: llvm/test/Transforms/LoopDataPrefetch/pragma-loop-prefetch-vectorize.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopDataPrefetch/pragma-loop-prefetch-vectorize.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes='loop-vectorize,loop-data-prefetch' -prefetch-distance=1000 -cache-line-size=64 -S | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local void @prefetch_vectorize(ptr %a, i32 %n, ptr %b) { ; b[i] += a[i] for i in [0, n); only the load from %a carries !llvm.loop.prefetch
+; CHECK-LABEL: @prefetch_vectorize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 3
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[UGLYGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[UGLYGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDVAR7:%.*]] = phi i64 [ [[INDVAR_NEXT8:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[INDVAR7]], 5
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 2432
+; CHECK-NEXT:    [[UGLYGEP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[INDVAR7]], 5
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], 320
+; CHECK-NEXT:    [[UGLYGEP9:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP9]], i32 0, i32 2, i32 1)
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP9]], align 8, !alias.scope !0, !llvm.loop.prefetch !3
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP7]], i32 2
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP10]], align 8, !alias.scope !0, !llvm.loop.prefetch !3
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP10]], i32 0, i32 3, i32 1)
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP13]], align 8, !alias.scope !4, !noalias !0
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i32 2
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr [[TMP14]], align 8, !alias.scope !4, !noalias !0
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd contract <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT:    [[TMP16:%.*]] = fadd contract <2 x double> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
+; CHECK-NEXT:    store <2 x double> [[TMP15]], ptr [[TMP13]], align 8, !alias.scope !4, !noalias !0
+; CHECK-NEXT:    store <2 x double> [[TMP16]], ptr [[TMP14]], align 8, !alias.scope !4, !noalias !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[INDVAR_NEXT8]] = add i64 [[INDVAR7]], 1
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = shl nsw i64 [[BC_RESUME_VAL]], 3
+; CHECK-NEXT:    [[TMP19:%.*]] = add nsw i64 [[TMP18]], 80
+; CHECK-NEXT:    [[TMP20:%.*]] = shl nsw i64 [[BC_RESUME_VAL]], 3
+; CHECK-NEXT:    [[TMP21:%.*]] = add nsw i64 [[TMP20]], 1136
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = shl i64 [[INDVAR]], 3
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[UGLYGEP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP24:%.*]] = shl i64 [[INDVAR]], 3
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[TMP19]], [[TMP24]]
+; CHECK-NEXT:    [[UGLYGEP5:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP25]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP5]], i32 0, i32 2, i32 1)
+; CHECK-NEXT:    [[TMP26:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !3
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP6]], i32 0, i32 3, i32 1)
+; CHECK-NEXT:    [[TMP27:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ADD:%.*]] = fadd contract double [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv
+  %0 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !0 ; hinted load (!0: L2, 10 iterations ahead); the checks expect %a prefetch offsets of 320 bytes (vector body) and 80 bytes (scalar loop)
+  %arrayidx2 = getelementptr inbounds double, ptr %b, i64 %indvars.iv
+  %1 = load double, ptr %arrayidx2, align 8 ; no prefetch metadata here; the checks still expect a prefetch for %b
+  %add = fadd contract double %0, %1
+  store double %add, ptr %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+!0 = distinct !{i1 true, i32 2, i32 10} ; prefetch enabled, L2 cache, issued 10 loop iterations ahead