This is an archive of the discontinued LLVM Phabricator instance.

[LoopDataPrefetch] Move prefetch to dominating position of two accesses
Abandoned · Public

Authored by jonpa on Oct 1 2019, 9:34 AM.

Download Raw Diff

Details

Reviewers

uweigand
hfinkel
anemet
jfb

Summary

I found that the LBM benchmark benefits significantly from enabling prefetching of stores in the hot loop with '-mllvm -min-prefetch-stride=128 -mllvm -loop-prefetch-writes' on SystemZ. I then observed that the same memory addresses are accessed on different paths in different iterations. Currently, the LoopDataPrefetch pass emits one prefetch and then skips adding another one for the next nearby or identical access, without considering whether the already emitted prefetch is actually executed on the path leading to that access. I therefore tried making sure that the single prefetch dominates both accesses, by moving it if necessary. From what I can see so far, on SystemZ at least, this patch gives a further 7-8% improvement on LBM (matching gcc).

Is this generally a good idea? The prefetch distance heuristic may get slightly thrown off by this in some bigger loops, I guess.

Diff Detail

Event Timeline

jonpa created this revision. · Oct 1 2019, 9:34 AM

Herald added a subscriber: dexonsmith. · View Herald Transcript · Oct 1 2019, 9:34 AM

ping

This change has instead been included in https://reviews.llvm.org/D70228.

Revision Contents

Path

Size

lib/

Transforms/

Scalar/

LoopDataPrefetch.cpp

50 lines

test/

CodeGen/

SystemZ/

prefetch-03.ll

46 lines

Diff 222638

lib/Transforms/Scalar/LoopDataPrefetch.cpp

Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines

STATISTIC(NumPrefetches, "Number of prefetches inserted");		STATISTIC(NumPrefetches, "Number of prefetches inserted");

namespace {		namespace {

/// Loop prefetch implementation class.		/// Loop prefetch implementation class.
class LoopDataPrefetch {		class LoopDataPrefetch {
public:		public:
LoopDataPrefetch(AssumptionCache AC, LoopInfo LI, ScalarEvolution *SE,		LoopDataPrefetch(AssumptionCache AC, DominatorTree DT, LoopInfo *LI,
const TargetTransformInfo *TTI,		ScalarEvolution SE, const TargetTransformInfo TTI,
OptimizationRemarkEmitter *ORE)		OptimizationRemarkEmitter *ORE)
: AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}		: AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}

bool run();		bool run();

private:		private:
bool runOnLoop(Loop *L);		bool runOnLoop(Loop *L);

/// Check if the stride of the accesses is large enough to		/// Check if the stride of the accesses is large enough to
/// warrant a prefetch.		/// warrant a prefetch.
Show All 19 Lines	private:

unsigned getMaxPrefetchIterationsAhead() {		unsigned getMaxPrefetchIterationsAhead() {
if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)		if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
return MaxPrefetchIterationsAhead;		return MaxPrefetchIterationsAhead;
return TTI->getMaxPrefetchIterationsAhead();		return TTI->getMaxPrefetchIterationsAhead();
}		}

AssumptionCache *AC;		AssumptionCache *AC;
		DominatorTree *DT;
LoopInfo *LI;		LoopInfo *LI;
ScalarEvolution *SE;		ScalarEvolution *SE;
const TargetTransformInfo *TTI;		const TargetTransformInfo *TTI;
OptimizationRemarkEmitter *ORE;		OptimizationRemarkEmitter *ORE;
};		};

/// Legacy class for inserting loop data prefetches.		/// Legacy class for inserting loop data prefetches.
class LoopDataPrefetchLegacyPass : public FunctionPass {		class LoopDataPrefetchLegacyPass : public FunctionPass {
public:		public:
static char ID; // Pass ID, replacement for typeid		static char ID; // Pass ID, replacement for typeid
LoopDataPrefetchLegacyPass() : FunctionPass(ID) {		LoopDataPrefetchLegacyPass() : FunctionPass(ID) {
initializeLoopDataPrefetchLegacyPassPass(*PassRegistry::getPassRegistry());		initializeLoopDataPrefetchLegacyPassPass(*PassRegistry::getPassRegistry());
}		}

void getAnalysisUsage(AnalysisUsage &AU) const override {		void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();		AU.addRequired<AssumptionCacheTracker>();
		AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();		AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();		AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();		AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();		AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();		AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();		AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();		AU.addRequired<TargetTransformInfoWrapperPass>();
}		}
Show All 30 Lines	if (!ConstStride)
return false;		return false;

unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());		unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
return TargetMinStride <= AbsStride;		return TargetMinStride <= AbsStride;
}		}

PreservedAnalyses LoopDataPrefetchPass::run(Function &F,		PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
FunctionAnalysisManager &AM) {		FunctionAnalysisManager &AM) {
		DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);		LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);		ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);		AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
OptimizationRemarkEmitter *ORE =		OptimizationRemarkEmitter *ORE =
&AM.getResult<OptimizationRemarkEmitterAnalysis>(F);		&AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);		const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);

LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);		LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
bool Changed = LDP.run();		bool Changed = LDP.run();

if (Changed) {		if (Changed) {
PreservedAnalyses PA;		PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();		PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();		PA.preserve<LoopAnalysis>();
return PA;		return PA;
}		}

return PreservedAnalyses::all();		return PreservedAnalyses::all();
}		}

bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {		bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))		if (skipFunction(F))
return false;		return false;

		DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();		LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();		ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AssumptionCache *AC =		AssumptionCache *AC =
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);		&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
OptimizationRemarkEmitter *ORE =		OptimizationRemarkEmitter *ORE =
&getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();		&getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
const TargetTransformInfo *TTI =		const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);		&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);		LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
return LDP.run();		return LDP.run();
}		}

bool LoopDataPrefetch::run() {		bool LoopDataPrefetch::run() {
// If PrefetchDistance is not set, don't run the pass. This gives an		// If PrefetchDistance is not set, don't run the pass. This gives an
// opportunity for targets to run this pass for selected subtargets only		// opportunity for targets to run this pass for selected subtargets only
// (whose TTI sets PrefetchDistance).		// (whose TTI sets PrefetchDistance).
if (getPrefetchDistance() == 0)		if (getPrefetchDistance() == 0)
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	bool LoopDataPrefetch::runOnLoop(Loop *L) {
unsigned LoopConstantTripCount = SE->getSmallConstantTripCount(L);		unsigned LoopConstantTripCount = SE->getSmallConstantTripCount(L);
if (LoopConstantTripCount && LoopConstantTripCount < ItersAhead)		if (LoopConstantTripCount && LoopConstantTripCount < ItersAhead)
return MadeChange;		return MadeChange;

LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead		LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
<< " iterations ahead (loop size: " << LoopSize << ") in "		<< " iterations ahead (loop size: " << LoopSize << ") in "
<< L->getHeader()->getParent()->getName() << ": " << *L);		<< L->getHeader()->getParent()->getName() << ": " << *L);

SmallVector<std::pair<Instruction , const SCEVAddRecExpr >, 16> PrefLoads;		// Pairs of (Prefetch, SCEVAddRecExpr) for each emitted prefetch.
		SmallVector<std::pair<Instruction , const SCEVAddRecExpr >, 16> Prefetches;
for (const auto BB : L->blocks()) {		for (const auto BB : L->blocks()) {
for (auto &I : *BB) {		for (auto &I : *BB) {
Value *PtrValue;		Value *PtrValue;
Instruction *MemI;		Instruction *MemI;

if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {		if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
MemI = LMemI;		MemI = LMemI;
PtrValue = LMemI->getPointerOperand();		PtrValue = LMemI->getPointerOperand();
Show All 15 Lines	for (auto &I : *BB) {
if (!LSCEVAddRec)		if (!LSCEVAddRec)
continue;		continue;

// Check if the stride of the accesses is large enough to warrant a		// Check if the stride of the accesses is large enough to warrant a
// prefetch.		// prefetch.
if (!isStrideLargeEnough(LSCEVAddRec))		if (!isStrideLargeEnough(LSCEVAddRec))
continue;		continue;

// We don't want to double prefetch individual cache lines. If this load		// We don't want to double prefetch individual cache lines. If this
// is known to be within one cache line of some other load that has		// access is known to be within one cache line of some other one that
// already been prefetched, then don't prefetch this one as well.		// has already been prefetched, then don't prefetch this one as well.
bool DupPref = false;		bool DupPref = false;
for (const auto &PrefLoad : PrefLoads) {		for (const auto &Pref : Prefetches) {
const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);		const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.second);
if (const SCEVConstant *ConstPtrDiff =		if (const SCEVConstant *ConstPtrDiff =
dyn_cast<SCEVConstant>(PtrDiff)) {		dyn_cast<SCEVConstant>(PtrDiff)) {
int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());		int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
if (PD < (int64_t) TTI->getCacheLineSize()) {		if (PD < (int64_t) TTI->getCacheLineSize()) {
		/// Make sure that the prefetch Pref already emitted is also
		/// executed each iteration MemI is reached, by moving the
		/// prefetch up in the CFG if needed.
		BasicBlock *PrefetchBB = Pref.first->getParent();
		if (PrefetchBB != BB) {
		BasicBlock *DomBB = DT->findNearestCommonDominator(PrefetchBB, BB);
		if (DomBB != PrefetchBB)
		Pref.first->moveBefore(DomBB->getTerminator());
		}
DupPref = true;		DupPref = true;
break;		break;
}		}
}		}
}		}
if (DupPref)		if (DupPref)
continue;		continue;

const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(		const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
SE->getConstant(LSCEVAddRec->getType(), ItersAhead),		SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
LSCEVAddRec->getStepRecurrence(*SE)));		LSCEVAddRec->getStepRecurrence(*SE)));
if (!isSafeToExpand(NextLSCEV, *SE))		if (!isSafeToExpand(NextLSCEV, *SE))
continue;		continue;

PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));

Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);		Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);
SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");		SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");
Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);		Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);

IRBuilder<> Builder(MemI);		IRBuilder<> Builder(MemI);
Module *M = BB->getParent()->getParent();		Module *M = BB->getParent()->getParent();
Type *I32 = Type::getInt32Ty(BB->getContext());		Type *I32 = Type::getInt32Ty(BB->getContext());
Function *PrefetchFunc = Intrinsic::getDeclaration(		Function *PrefetchFunc = Intrinsic::getDeclaration(
M, Intrinsic::prefetch, PrefPtrValue->getType());		M, Intrinsic::prefetch, PrefPtrValue->getType());
		Instruction *Prefetch =
Builder.CreateCall(		Builder.CreateCall(
PrefetchFunc,		PrefetchFunc,
{PrefPtrValue,		{PrefPtrValue,
ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),		ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});		ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
		Prefetches.push_back(std::make_pair(Prefetch, LSCEVAddRec));
++NumPrefetches;		++NumPrefetches;
LLVM_DEBUG(dbgs() << " Access: " << PtrValue << ", SCEV: " << LSCEV		LLVM_DEBUG(dbgs() << " Access: " << PtrValue << ", SCEV: " << LSCEV
<< "\n");		<< "\n");
ORE->emit([&]() {		ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)		return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
<< "prefetched memory access";		<< "prefetched memory access";
});		});

MadeChange = true;		MadeChange = true;
}		}
}		}

return MadeChange;		return MadeChange;
}		}

test/CodeGen/SystemZ/prefetch-03.ll

This file was added.

				; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -prefetch-distance=50 \
				; RUN: -loop-prefetch-writes -stop-after=loop-data-prefetch \| FileCheck %s
				;
				; Check that prefetches are emitted in a position that is executed each
				; iteration for each targeted memory instruction. The two stores in %true and
				; %false are within one cache line in memory, so they should get a single
				; prefetch in %for.body.
				;
				; CHECK-LABEL: for.body
				; CHECK: call void @llvm.prefetch.p0i8(i8* {{.*}}, i32 0
				; CHECK: call void @llvm.prefetch.p0i8(i8* {{.*}}, i32 1
				; CHECK-LABEL: true
				; CHECK-LABEL: false
				; CHECK-LABEL: latch

				define void @fun(i32* nocapture %Src, i32* nocapture readonly %Dst) {
				entry:
				br label %for.body

				for.body:
				%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.9, %latch ]
				%arrayidx = getelementptr inbounds i32, i32* %Dst, i64 %indvars.iv
				%0 = load i32, i32* %arrayidx, align 4
				%cmp = icmp sgt i32 %0, 0
				br i1 %cmp, label %true, label %false

				true:
				%arrayidx2 = getelementptr inbounds i32, i32* %Src, i64 %indvars.iv
				store i32 %0, i32* %arrayidx2, align 4
				br label %latch

				false:
				%a = add i64 %indvars.iv, 8
				%arrayidx3 = getelementptr inbounds i32, i32* %Src, i64 %a
				store i32 %0, i32* %arrayidx3, align 4
				br label %latch

				latch:
				%indvars.iv.next.9 = add nuw nsw i64 %indvars.iv, 1600
				%cmp.9 = icmp ult i64 %indvars.iv.next.9, 11200
				br i1 %cmp.9, label %for.body, label %for.cond.cleanup

				for.cond.cleanup:
				ret void
				}