Index: lib/Transforms/Scalar/LoopDataPrefetch.cpp
===================================================================
--- lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -60,10 +60,10 @@
 /// Loop prefetch implementation class.
 class LoopDataPrefetch {
 public:
-  LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE,
-                   const TargetTransformInfo *TTI,
+  LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
+                   ScalarEvolution *SE, const TargetTransformInfo *TTI,
                    OptimizationRemarkEmitter *ORE)
-      : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
+      : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
 
   bool run();
 
@@ -99,6 +99,7 @@
   }
 
   AssumptionCache *AC;
+  DominatorTree *DT;
   LoopInfo *LI;
   ScalarEvolution *SE;
   const TargetTransformInfo *TTI;
@@ -115,6 +116,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addPreserved<LoopInfoWrapperPass>();
@@ -161,6 +163,7 @@
 
 PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
+  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
   LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
   ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
   AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
@@ -168,7 +171,7 @@
       &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
 
-  LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+  LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
   bool Changed = LDP.run();
 
   if (Changed) {
@@ -185,6 +188,7 @@
   if (skipFunction(F))
     return false;
 
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   AssumptionCache *AC =
@@ -194,7 +198,7 @@
   const TargetTransformInfo *TTI =
       &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+  LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
   return LDP.run();
 }
 
@@ -257,7 +261,8 @@
                     << " iterations ahead (loop size: " << LoopSize << ") in "
                     << L->getHeader()->getParent()->getName() << ": " << *L);
 
-  SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
+  // Each emitted prefetch call, paired with the SCEVAddRecExpr of its access.
+  SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> Prefetches;
   for (const auto BB : L->blocks()) {
     for (auto &I : *BB) {
       Value *PtrValue;
@@ -289,16 +294,25 @@
       if (!isStrideLargeEnough(LSCEVAddRec))
         continue;
 
-      // We don't want to double prefetch individual cache lines. If this load
-      // is known to be within one cache line of some other load that has
-      // already been prefetched, then don't prefetch this one as well.
+      // We don't want to double prefetch individual cache lines. If this
+      // access is known to be within one cache line of some other one that
+      // has already been prefetched, then don't prefetch this one as well.
       bool DupPref = false;
-      for (const auto &PrefLoad : PrefLoads) {
-        const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);
+      for (const auto &Pref : Prefetches) {
+        const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.second);
         if (const SCEVConstant *ConstPtrDiff =
             dyn_cast<SCEVConstant>(PtrDiff)) {
           int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
           if (PD < (int64_t) TTI->getCacheLineSize()) {
+            // Make sure that the already emitted prefetch Pref is also
+            // executed in every iteration that reaches MemI, by moving it
+            // up in the CFG to a common dominator if needed.
+            BasicBlock *PrefetchBB = Pref.first->getParent();
+            if (PrefetchBB != BB) {
+              BasicBlock *DomBB = DT->findNearestCommonDominator(PrefetchBB, BB);
+              if (DomBB != PrefetchBB)
+                Pref.first->moveBefore(DomBB->getTerminator());
+            }
             DupPref = true;
             break;
           }
@@ -313,8 +327,6 @@
       if (!isSafeToExpand(NextLSCEV, *SE))
         continue;
 
-      PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));
-
       Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);
       SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");
       Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);
@@ -324,11 +336,13 @@
       Type *I32 = Type::getInt32Ty(BB->getContext());
       Function *PrefetchFunc = Intrinsic::getDeclaration(
           M, Intrinsic::prefetch, PrefPtrValue->getType());
-      Builder.CreateCall(
-          PrefetchFunc,
-          {PrefPtrValue,
-           ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
-           ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+      Instruction *Prefetch =
+        Builder.CreateCall(
+            PrefetchFunc,
+            {PrefPtrValue,
+             ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
+             ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+      Prefetches.push_back(std::make_pair(Prefetch, LSCEVAddRec));
       ++NumPrefetches;
       LLVM_DEBUG(dbgs() << "  Access: " << *PtrValue << ", SCEV: " << *LSCEV
                         << "\n");
Index: test/CodeGen/SystemZ/prefetch-03.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/prefetch-03.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -prefetch-distance=50 \
+; RUN:   -loop-prefetch-writes -stop-after=loop-data-prefetch | FileCheck %s
+;
+; Check that prefetches are emitted in a position that is executed each
+; iteration for each targeted memory instruction. The two stores in %true and
+; %false are within one cache line in memory, so they should get a single
+; prefetch in %for.body, and neither %true nor %false may contain a prefetch.
+;
+; CHECK-LABEL: for.body:
+; CHECK: call void @llvm.prefetch.p0i8(i8* {{.*}}, i32 0
+; CHECK: call void @llvm.prefetch.p0i8(i8* {{.*}}, i32 1
+; CHECK-LABEL: true:
+; CHECK-NOT: call void @llvm.prefetch
+; CHECK-LABEL: false:
+; CHECK-NOT: call void @llvm.prefetch
+; CHECK-LABEL: latch:
+
+define void @fun(i32* nocapture %Src, i32* nocapture readonly %Dst) {
+entry:  ; test body: a strided loop with one load and two conditional stores
+  br label %for.body
+
+for.body:  ; loop header: load %Dst[iv] and branch on whether it is > 0
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.9, %latch ]
+  %arrayidx = getelementptr inbounds i32, i32* %Dst, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp sgt i32 %0, 0
+  br i1 %cmp, label %true, label %false
+
+true:  ; loaded value > 0: store it to %Src[iv]
+  %arrayidx2 = getelementptr inbounds i32, i32* %Src, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  br label %latch
+
+false:  ; otherwise: store it to %Src[iv + 8], i.e. 32 bytes past %Src[iv]
+  %a = add i64 %indvars.iv, 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %Src, i64 %a
+  store i32 %0, i32* %arrayidx3, align 4
+  br label %latch
+
+latch:  ; advance iv by 1600 elements; keep looping while iv < 11200
+  %indvars.iv.next.9 = add nuw nsw i64 %indvars.iv, 1600
+  %cmp.9 = icmp ult i64 %indvars.iv.next.9, 11200
+  br i1 %cmp.9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+