Index: lib/Transforms/Scalar/LoopDataPrefetch.cpp
===================================================================
--- lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -60,10 +60,10 @@
 /// Loop prefetch implementation class.
 class LoopDataPrefetch {
 public:
-  LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE,
-                   const TargetTransformInfo *TTI,
+  LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
+                   ScalarEvolution *SE, const TargetTransformInfo *TTI,
                    OptimizationRemarkEmitter *ORE)
-      : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
+      : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
 
   bool run();
 
@@ -99,6 +99,7 @@
   }
 
   AssumptionCache *AC;
+  DominatorTree *DT;
   LoopInfo *LI;
   ScalarEvolution *SE;
   const TargetTransformInfo *TTI;
@@ -115,6 +116,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addPreserved<LoopInfoWrapperPass>();
@@ -161,6 +163,7 @@
 
 PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
+  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
   LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
   ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
   AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
@@ -168,7 +171,7 @@
       &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
 
-  LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+  LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
   bool Changed = LDP.run();
 
   if (Changed) {
@@ -185,6 +188,7 @@
   if (skipFunction(F))
     return false;
 
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   AssumptionCache *AC =
@@ -194,7 +198,7 @@
   const TargetTransformInfo *TTI =
       &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+  LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
   return LDP.run();
 }
 
@@ -257,7 +261,8 @@
                     << " iterations ahead (loop size: " << LoopSize << ") in "
                     << L->getHeader()->getParent()->getName() << ": " << *L);
 
-  SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
+  // Pairs of (Prefetch, SCEVAddRecExpr) for each emitted prefetch.
+  SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> Prefetches;
   for (const auto BB : L->blocks()) {
     for (auto &I : *BB) {
       Value *PtrValue;
@@ -289,16 +294,25 @@
       if (!isStrideLargeEnough(LSCEVAddRec))
         continue;
 
-      // We don't want to double prefetch individual cache lines. If this load
-      // is known to be within one cache line of some other load that has
-      // already been prefetched, then don't prefetch this one as well.
+      // We don't want to double prefetch individual cache lines. If this
+      // access is known to be within one cache line of some other one that
+      // has already been prefetched, then don't prefetch this one as well.
       bool DupPref = false;
-      for (const auto &PrefLoad : PrefLoads) {
-        const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);
+      for (const auto &Pref : Prefetches) {
+        const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.second);
         if (const SCEVConstant *ConstPtrDiff =
             dyn_cast<SCEVConstant>(PtrDiff)) {
           int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
           if (PD < (int64_t) TTI->getCacheLineSize()) {
+            // Make sure that the prefetch Pref already emitted is also
+            // executed on each iteration in which MemI is reached, by
+            // moving the prefetch up in the CFG if needed.
+            BasicBlock *PrefetchBB = Pref.first->getParent();
+            if (PrefetchBB != BB) {
+              BasicBlock *DomBB = DT->findNearestCommonDominator(PrefetchBB, BB);
+              if (DomBB != PrefetchBB)
+                Pref.first->moveBefore(DomBB->getTerminator());
+            }
             DupPref = true;
             break;
           }
@@ -313,8 +327,6 @@
       if (!isSafeToExpand(NextLSCEV, *SE))
         continue;
 
-      PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));
-
       Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);
       SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");
       Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);
@@ -324,11 +336,13 @@
       Type *I32 = Type::getInt32Ty(BB->getContext());
       Function *PrefetchFunc = Intrinsic::getDeclaration(
           M, Intrinsic::prefetch, PrefPtrValue->getType());
-      Builder.CreateCall(
-          PrefetchFunc,
-          {PrefPtrValue,
-           ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
-           ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+      Instruction *Prefetch =
+          Builder.CreateCall(
+              PrefetchFunc,
+              {PrefPtrValue,
+               ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
+               ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+      Prefetches.push_back(std::make_pair(Prefetch, LSCEVAddRec));
       ++NumPrefetches;
       LLVM_DEBUG(dbgs() << "  Access: " << *PtrValue << ", SCEV: " << *LSCEV
                         << "\n");
Index: test/CodeGen/SystemZ/prefetch-03.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/prefetch-03.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -prefetch-distance=50 \
+; RUN:   -loop-prefetch-writes -stop-after=loop-data-prefetch | FileCheck %s
+;
+; Check that prefetches are emitted in a position that is executed each
+; iteration for each targeted memory instruction. The two stores in %true and
+; %false are within one cache line in memory, so they should get a single
+; prefetch in %for.body.
+;
+; CHECK-LABEL: for.body
+; CHECK: call void @llvm.prefetch.p0i8(i8* {{.*}}, i32 0
+; CHECK: call void @llvm.prefetch.p0i8(i8* {{.*}}, i32 1
+; CHECK-LABEL: true
+; CHECK-LABEL: false
+; CHECK-LABEL: latch
+
+define void @fun(i32* nocapture %Src, i32* nocapture readonly %Dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.9, %latch ]
+  %arrayidx = getelementptr inbounds i32, i32* %Dst, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp sgt i32 %0, 0
+  br i1 %cmp, label %true, label %false
+
+true:
+  %arrayidx2 = getelementptr inbounds i32, i32* %Src, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  br label %latch
+
+false:
+  %a = add i64 %indvars.iv, 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %Src, i64 %a
+  store i32 %0, i32* %arrayidx3, align 4
+  br label %latch
+
+latch:
+  %indvars.iv.next.9 = add nuw nsw i64 %indvars.iv, 1600
+  %cmp.9 = icmp ult i64 %indvars.iv.next.9, 11200
+  br i1 %cmp.9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+