Skip to content

Commit 6d8beec

Browse files
committed Mar 18, 2016
[LoopDataPrefetch/Aarch64] Allow selective prefetching of large-strided accesses
Summary: And use this TTI for Cyclone. As it was explained in the original RFC (http://thread.gmane.org/gmane.comp.compilers.llvm.devel/92758), the HW prefetcher works with strides up to 2KB. I am also adding tests for this and the previous change (D17943): * Cyclone prefetching accesses with a large stride * Cyclone not prefetching accesses with a small stride * Generic Aarch64 subtarget not prefetching either Reviewers: hfinkel Subscribers: aemerson, rengolin, llvm-commits, mzolotukhin Differential Revision: http://reviews.llvm.org/D17945 llvm-svn: 263771
1 parent 53e758f commit 6d8beec

File tree

8 files changed

+109
-0
lines changed

8 files changed

+109
-0
lines changed
 

‎llvm/include/llvm/Analysis/TargetTransformInfo.h

+9
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,11 @@ class TargetTransformInfo {
423423
/// This is currently measured in number of instructions.
424424
unsigned getPrefetchDistance() const;
425425

426+
/// \return Some HW prefetchers can handle accesses up to a certain constant
427+
/// stride. This is the minimum stride in bytes where it makes sense to start
428+
/// adding SW prefetches. The default is 1, i.e. prefetch with any stride.
429+
unsigned getMinPrefetchStride() const;
430+
426431
/// \return The maximum interleave factor that any transform should try to
427432
/// perform for this target. This number depends on the level of parallelism
428433
/// and the number of execution units in the CPU.
@@ -618,6 +623,7 @@ class TargetTransformInfo::Concept {
618623
virtual unsigned getRegisterBitWidth(bool Vector) = 0;
619624
virtual unsigned getCacheLineSize() = 0;
620625
virtual unsigned getPrefetchDistance() = 0;
626+
virtual unsigned getMinPrefetchStride() = 0;
621627
virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
622628
virtual unsigned
623629
getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
@@ -788,6 +794,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
788794
return Impl.getCacheLineSize();
789795
}
790796
unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); }
  /// Virtual-dispatch shim: forwards to the concrete implementation held in
  /// Impl (part of the TargetTransformInfo concept/model type-erasure pair).
  unsigned getMinPrefetchStride() override {
    return Impl.getMinPrefetchStride();
  }
791800
unsigned getMaxInterleaveFactor(unsigned VF) override {
792801
return Impl.getMaxInterleaveFactor(VF);
793802
}

‎llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

+2
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,8 @@ class TargetTransformInfoImplBase {
268268

269269
unsigned getPrefetchDistance() { return 0; }
270270

  /// Base-class default: a minimum stride of 1, i.e. prefetching is
  /// considered worthwhile for any constant stride.
  unsigned getMinPrefetchStride() { return 1; }
272+
271273
unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
272274

273275
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,

‎llvm/lib/Analysis/TargetTransformInfo.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,10 @@ unsigned TargetTransformInfo::getPrefetchDistance() const {
223223
return TTIImpl->getPrefetchDistance();
224224
}
225225

226+
unsigned TargetTransformInfo::getMinPrefetchStride() const {
227+
return TTIImpl->getMinPrefetchStride();
228+
}
229+
226230
unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const {
227231
return TTIImpl->getMaxInterleaveFactor(VF);
228232
}

‎llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

+12
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ static cl::opt<unsigned> CyclonePrefetchDistance(
2525
cl::desc("Number of instructions to prefetch ahead for Cyclone"),
2626
cl::init(280), cl::Hidden);
2727

28+
// The HW prefetcher handles accesses with strides up to 2KB.
29+
static cl::opt<unsigned> CycloneMinPrefetchStride(
30+
"cyclone-min-prefetch-stride",
31+
cl::desc("Min stride to add prefetches for Cyclone"),
32+
cl::init(2048), cl::Hidden);
33+
2834
/// \brief Calculate the cost of materializing a 64-bit value. This helper
2935
/// method might only calculate a fraction of a larger immediate. Therefore it
3036
/// is valid to return a cost of ZERO.
@@ -590,3 +596,9 @@ unsigned AArch64TTIImpl::getPrefetchDistance() {
590596
return CyclonePrefetchDistance;
591597
return BaseT::getPrefetchDistance();
592598
}
599+
600+
unsigned AArch64TTIImpl::getMinPrefetchStride() {
601+
if (ST->isCyclone())
602+
return CycloneMinPrefetchStride;
603+
return BaseT::getMinPrefetchStride();
604+
}

‎llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

+2
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
131131
unsigned getCacheLineSize();
132132

133133
unsigned getPrefetchDistance();
134+
135+
unsigned getMinPrefetchStride();
134136
/// @}
135137
};
136138

‎llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp

+25
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ namespace {
7373
bool runOnFunction(Function &F) override;
7474
bool runOnLoop(Loop *L);
7575

76+
/// \brief Check if the stride of the accesses is large enough to
77+
/// warrant a prefetch.
78+
bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
79+
7680
private:
7781
AssumptionCache *AC;
7882
LoopInfo *LI;
@@ -94,6 +98,22 @@ INITIALIZE_PASS_END(LoopDataPrefetch, "loop-data-prefetch",
9498

9599
FunctionPass *llvm::createLoopDataPrefetchPass() { return new LoopDataPrefetch(); }
96100

101+
bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
102+
unsigned TargetMinStride = TTI->getMinPrefetchStride();
103+
// No need to check if any stride goes.
104+
if (TargetMinStride <= 1)
105+
return true;
106+
107+
const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
108+
// If MinStride is set, don't prefetch unless we can ensure that stride is
109+
// larger.
110+
if (!ConstStride)
111+
return false;
112+
113+
unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
114+
return TargetMinStride <= AbsStride;
115+
}
116+
97117
bool LoopDataPrefetch::runOnFunction(Function &F) {
98118
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
99119
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
@@ -184,6 +204,11 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
184204
if (!LSCEVAddRec)
185205
continue;
186206

207+
// Check if the stride of the accesses is large enough to warrant a
208+
// prefetch.
209+
if (!isStrideLargeEnough(LSCEVAddRec))
210+
continue;
211+
187212
// We don't want to double prefetch individual cache lines. If this load
188213
// is known to be within one cache line of some other load that has
189214
// already been prefetched, then don't prefetch this one as well.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
2+
; RUN: opt -mcpu=generic -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
3+
4+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
5+
6+
; The stride here is one double (8 bytes) per iteration -- below any
; min-prefetch-stride cutoff -- so no prefetch intrinsic is expected on
; either subtarget.
; ALL-LABEL: @small_stride(
define void @small_stride(double* nocapture %a, double* nocapture readonly %b) {
entry:
  br label %for.body

; ALL: for.body:
for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
; ALL-NOT: call void @llvm.prefetch
  %0 = load double, double* %arrayidx, align 8
  %add = fadd double %0, 1.000000e+00
  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
  store double %add, double* %arrayidx2, align 8
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1600
  br i1 %exitcond, label %for.end, label %for.body

; ALL: for.end:
for.end:                                          ; preds = %for.body
  ret void
}
28+
; The induction variable advances by 300 doubles (2400 bytes) per iteration,
; above Cyclone's 2KB cutoff, so a prefetch intrinsic is expected for Cyclone
; but not for the generic subtarget.
; ALL-LABEL: @large_stride(
define void @large_stride(double* nocapture %a, double* nocapture readonly %b) {
entry:
  br label %for.body

; ALL: for.body:
for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
; LARGE_PREFETCH: call void @llvm.prefetch
; NO_LARGE_PREFETCH-NOT: call void @llvm.prefetch
  %0 = load double, double* %arrayidx, align 8
  %add = fadd double %0, 1.000000e+00
  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
  store double %add, double* %arrayidx2, align 8
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 300
  %exitcond = icmp eq i64 %indvars.iv.next, 160000
  br i1 %exitcond, label %for.end, label %for.body

; ALL: for.end:
for.end:                                          ; preds = %for.body
  ret void
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
# lit configuration for this directory: only .ll tests, and only when the
# AArch64 backend was built into this LLVM.
config.suffixes = ['.ll']

# Idiomatic membership test ('x not in y' rather than 'not x in y').
if 'AArch64' not in config.root.targets:
    config.unsupported = True

0 commit comments

Comments
 (0)
Please sign in to comment.