Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -477,6 +477,9 @@
   /// This is currently measured in number of instructions.
   unsigned getPrefetchDistance() const;
 
+  /// \return The number of cache lines ahead we should prefetch.
+  unsigned getPrefetchDegree() const;
+
   /// \return Some HW prefetchers can handle accesses up to a certain constant
   /// stride. This is the minimum stride in bytes where it makes sense to start
   /// adding SW prefetches. The default is 1, i.e. prefetch with any stride.
@@ -730,6 +733,7 @@
   virtual unsigned getRegisterBitWidth(bool Vector) = 0;
   virtual unsigned getCacheLineSize() = 0;
   virtual unsigned getPrefetchDistance() = 0;
+  virtual unsigned getPrefetchDegree() = 0;
   virtual unsigned getMinPrefetchStride() = 0;
   virtual unsigned getMaxPrefetchIterationsAhead() = 0;
   virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
@@ -944,6 +948,7 @@
     return Impl.getCacheLineSize();
   }
   unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); }
+  unsigned getPrefetchDegree() override { return Impl.getPrefetchDegree(); }
   unsigned getMinPrefetchStride() override {
     return Impl.getMinPrefetchStride();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -295,6 +295,8 @@
   unsigned getPrefetchDistance() { return 0; }
 
+  unsigned getPrefetchDegree() { return 0; }
+
   unsigned getMinPrefetchStride() { return 1; }
 
   unsigned getMaxPrefetchIterationsAhead() { return UINT_MAX; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -262,6 +262,10 @@
   return TTIImpl->getPrefetchDistance();
 }
 
+unsigned TargetTransformInfo::getPrefetchDegree() const {
+  return TTIImpl->getPrefetchDegree();
+}
+
 unsigned TargetTransformInfo::getMinPrefetchStride() const {
   return TTIImpl->getMinPrefetchStride();
 }
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -87,6 +87,7 @@
   uint8_t VectorInsertExtractBaseCost = 3;
   uint16_t CacheLineSize = 0;
   uint16_t PrefetchDistance = 0;
+  uint16_t PrefetchDegree = 0;
   uint16_t MinPrefetchStride = 1;
   unsigned MaxPrefetchIterationsAhead = UINT_MAX;
   unsigned PrefFunctionAlignment = 0;
@@ -197,6 +198,7 @@
   }
   unsigned getCacheLineSize() const { return CacheLineSize; }
   unsigned getPrefetchDistance() const { return PrefetchDistance; }
+  unsigned getPrefetchDegree() const { return PrefetchDegree; }
   unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
   unsigned getMaxPrefetchIterationsAhead() const {
     return MaxPrefetchIterationsAhead;
Index: lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.cpp
+++ lib/Target/AArch64/AArch64Subtarget.cpp
@@ -73,6 +73,7 @@
     VectorInsertExtractBaseCost = 2;
     CacheLineSize = 128;
     PrefetchDistance = 740;
+    PrefetchDegree = 1;
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 11;
     break;
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -135,6 +135,8 @@
   unsigned getPrefetchDistance();
 
+  unsigned getPrefetchDegree();
+
   unsigned getMinPrefetchStride();
 
   unsigned getMaxPrefetchIterationsAhead();
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -633,6 +633,10 @@
   return ST->getPrefetchDistance();
 }
 
+unsigned AArch64TTIImpl::getPrefetchDegree() {
+  return ST->getPrefetchDegree();
+}
+
 unsigned AArch64TTIImpl::getMinPrefetchStride() {
   return ST->getMinPrefetchStride();
 }
Index: lib/Transforms/Scalar/LoopDataPrefetch.cpp
===================================================================
--- lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -52,6 +52,11 @@
                      cl::Hidden);
 
 static cl::opt<unsigned>
+    PrefetchDegree("prefetch-degree",
+                   cl::desc("Number of cache lines to prefetch ahead"),
+                   cl::Hidden);
+
+static cl::opt<unsigned>
     MinPrefetchStride("min-prefetch-stride",
                       cl::desc("Min stride to add prefetches"), cl::Hidden);
@@ -80,6 +85,9 @@
     /// warrant a prefetch.
    bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
 
+    /// \brief Look for irregular symbolic strides.
+    bool isIrregularSymbolicStride(const SCEVAddRecExpr *AR, Loop *L);
+
    unsigned getMinPrefetchStride() {
      if (MinPrefetchStride.getNumOccurrences() > 0)
        return MinPrefetchStride;
@@ -92,6 +100,12 @@
      return TTI->getPrefetchDistance();
    }
 
+    unsigned getPrefetchDegree() {
+      if (PrefetchDegree.getNumOccurrences() > 0)
+        return PrefetchDegree;
+      return TTI->getPrefetchDegree();
+    }
+
    unsigned getMaxPrefetchIterationsAhead() {
      if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
        return MaxPrefetchIterationsAhead;
@@ -161,6 +175,53 @@
   return TargetMinStride <= AbsStride;
 }
 
+bool LoopDataPrefetch::isIrregularSymbolicStride(const SCEVAddRecExpr *AR,
+                                                 Loop *L) {
+  // If there is no outer loop we cannot ensure the stride is irregular.
+  const Loop *OuterLoop = L->getParentLoop();
+  if (!OuterLoop)
+    return false;
+
+  // If the base address is varying, then it is an irregular access.
+  const SCEV *Base = AR->getStart();
+  if (SE->isLoopInvariant(Base, OuterLoop))
+    return false;
+
+  // Now check if it is a symbolic stride.
+  const SCEV *V = AR->getStepRecurrence(*SE);
+  if (!V)
+    return false;
+
+  const auto *M = dyn_cast<SCEVMulExpr>(V);
+  if (!M)
+    return false;
+
+  // Check if the step value is a non-unit constant.
+  // Be conservative and give up if the step value is larger than the cache
+  // line.
+  V = M->getOperand(0);
+  if (V->getSCEVType() != scConstant)
+    return false;
+
+  if (V->isOne() || V->isAllOnesValue())
+    return false;
+
+  const APInt &APStepVal = cast<SCEVConstant>(V)->getAPInt();
+  int64_t StepVal = APStepVal.getSExtValue();
+  if (StepVal > TTI->getCacheLineSize())
+    return false;
+
+  V = M->getOperand(1);
+
+  // Strip off casts.
+  while (const auto *C = dyn_cast<SCEVCastExpr>(V))
+    V = C->getOperand();
+
+  const auto *U = dyn_cast<SCEVUnknown>(V);
+  if (!U)
+    return false;
+
+  return L->isLoopInvariant(U->getValue());
+}
+
 PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
@@ -245,21 +306,16 @@
     LoopSize = 1;
 
   unsigned ItersAhead = getPrefetchDistance() / LoopSize;
+  unsigned BytesAhead = TTI->getCacheLineSize() * getPrefetchDegree();
   if (!ItersAhead)
     ItersAhead = 1;
 
-  if (ItersAhead > getMaxPrefetchIterationsAhead())
-    return MadeChange;
-
-  DEBUG(dbgs() << "Prefetching " << ItersAhead
-               << " iterations ahead (loop size: " << LoopSize << ") in "
-               << L->getHeader()->getParent()->getName() << ": " << *L);
-
   SmallVector<std::pair<Instruction *, Instruction *>, 16> PrefLoads;
   for (const auto BB : L->blocks()) {
     for (auto &I : *BB) {
       Value *PtrValue;
       Instruction *MemI;
+      bool PrefetchNextLine = false;
 
       if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
         MemI = LMemI;
@@ -270,7 +326,11 @@
         PtrValue = SMemI->getPointerOperand();
       } else continue;
 
-      unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+      auto *PtrTy = dyn_cast<PointerType>(PtrValue->getType());
+      if (!PtrTy || PtrTy->isAggregateType())
+        continue;
+
+      unsigned PtrAddrSpace = PtrTy->getPointerAddressSpace();
       if (PtrAddrSpace)
         continue;
@@ -278,13 +338,31 @@
         continue;
 
       const SCEV *LSCEV = SE->getSCEV(PtrValue);
-      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+      const auto *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
       if (!LSCEVAddRec)
         continue;
 
+      if (getPrefetchDegree() > 0)
+        PrefetchNextLine = isIrregularSymbolicStride(LSCEVAddRec, L);
+
+      if (!PrefetchNextLine && ItersAhead > getMaxPrefetchIterationsAhead())
+        continue;
+
+      if (PrefetchNextLine) {
+        DEBUG(dbgs() << "Prefetching " << getPrefetchDegree()
+                     << " cache line(s) ahead (irregular access: " << *PtrValue
+                     << ") in " << L->getHeader()->getParent()->getName()
+                     << ": " << *L);
+      } else {
+        DEBUG(dbgs() << "Prefetching " << ItersAhead
+                     << " iterations ahead (loop size: " << LoopSize << ") in "
+                     << L->getHeader()->getParent()->getName() << ": " << *L);
+      }
+
       // Check if the the stride of the accesses is large enough to warrant a
       // prefetch.
-      if (!isStrideLargeEnough(LSCEVAddRec))
+      if (!PrefetchNextLine && !isStrideLargeEnough(LSCEVAddRec))
         continue;
 
       // We don't want to double prefetch individual cache lines. If this load
@@ -305,9 +383,17 @@
       if (DupPref)
         continue;
 
-      const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
-          SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
-          LSCEVAddRec->getStepRecurrence(*SE)));
+      // If this is an irregular symbolic stride, prefetch the next cache line.
+      const SCEV *NextLSCEV =
+          PrefetchNextLine
+              ? SE->getAddExpr(LSCEVAddRec,
+                               SE->getConstant(LSCEVAddRec->getType(),
+                                               BytesAhead))
+              : SE->getAddExpr(
+                    LSCEVAddRec,
+                    SE->getMulExpr(
+                        SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
+                        LSCEVAddRec->getStepRecurrence(*SE)));
 
       if (!isSafeToExpand(NextLSCEV, *SE))
         continue;
Index: test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll
===================================================================
--- test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll
+++ test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll
@@ -1,7 +1,11 @@
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -max-prefetch-iters-ahead=1000 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=SYMBOLIC_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -prefetch-degree=0 -S < %s | FileCheck %s --check-prefix=NO_SYMBOLIC_PREFETCH --check-prefix=ALL
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -max-prefetch-iters-ahead=1000 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -S < %s | FileCheck %s --check-prefix=SYMBOLIC_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -prefetch-degree=0 -S < %s | FileCheck %s --check-prefix=NO_SYMBOLIC_PREFETCH --check-prefix=ALL
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
 
@@ -51,3 +55,109 @@
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+; No prefetching in the presence of regular symbolic strides:
+;
+;   for (unsigned i = 0; i < 100; i++)
+;     A[i + 1] = A[Stride * i] + B[i];
+
+; ALL-LABEL: @regular_symbolic_stride(
+define void @regular_symbolic_stride(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N,
+                                     i64 %stride) {
+entry:
+  br label %for.body
+
+; ALL: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %mul = mul i64 %indvars.iv, %stride
+; SYMBOLIC_PREFETCH-NOT: call void @llvm.prefetch
+; NO_SYMBOLIC_PREFETCH-NOT: call void @llvm.prefetch
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %mul
+  %load = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %load_1 = load i32, i32* %arrayidx2, align 4
+  %add = add i32 %load_1, %load
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  store i32 %add, i32* %arrayidx_next, align 4
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Prefetching in the presence of irregular symbolic strides:
+;
+;   struct MyStruct {
+;     int field;
+;     char kk[60];
+;   } *my_struct;
+;
+;   int f(struct MyStruct *p, struct MyStruct *q, int N) {
+;     int total = 0;
+;     struct MyStruct *r = p;
+;     for (int i = 0; i < N/300; ++i)
+;       for (r = p + i; r < q; r += N)
+;         total += r->field;
+;     return total;
+;   }
+
+%struct.MyStruct = type { i32, [60 x i8] }
+
+@my_struct = local_unnamed_addr global %struct.MyStruct* null, align 8
+
+; ALL-LABEL: @irregular_symbolic_stride(
+define i32 @irregular_symbolic_stride(%struct.MyStruct* readonly %p, %struct.MyStruct* readnone %q, i32 %N) local_unnamed_addr {
+entry:
+  %cmp23 = icmp sgt i32 %N, 299
+  br i1 %cmp23, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %div27 = udiv i32 %N, 300
+  %idx.ext4 = sext i32 %N to i64
+  %0 = zext i32 %div27 to i64
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.lr.ph, %for.inc6
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next, %for.inc6 ]
+  %total.024 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %total.1.lcssa, %for.inc6 ]
+  %add.ptr519 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %p, i64 %indvars.iv
+  %cmp220 = icmp ult %struct.MyStruct* %add.ptr519, %q
+  br i1 %cmp220, label %for.body3.preheader, label %for.inc6
+
+for.body3.preheader:                              ; preds = %for.cond1.preheader
+  br label %for.body3
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc6
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %total.0.lcssa = phi i32 [ 0, %entry ], [ %total.1.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %total.0.lcssa
+
+; ALL: for.body3:
+for.body3:                                        ; preds = %for.body3.preheader, %for.body3
+  %add.ptr522 = phi %struct.MyStruct* [ %add.ptr5, %for.body3 ], [ %add.ptr519, %for.body3.preheader ]
+  %total.121 = phi i32 [ %add, %for.body3 ], [ %total.024, %for.body3.preheader ]
+  %field = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %add.ptr522, i64 0, i32 0
+; SYMBOLIC_PREFETCH: call void @llvm.prefetch
+; NO_SYMBOLIC_PREFETCH-NOT: call void @llvm.prefetch
+  %1 = load i32, i32* %field, align 4
+  %add = add nsw i32 %1, %total.121
+  %add.ptr5 = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %add.ptr522, i64 %idx.ext4
+  %cmp2 = icmp ult %struct.MyStruct* %add.ptr5, %q
+  br i1 %cmp2, label %for.body3, label %for.inc6.loopexit
+
+; ALL: for.inc6.loopexit:
+for.inc6.loopexit:                                ; preds = %for.body3
+  br label %for.inc6
+
+for.inc6:                                         ; preds = %for.inc6.loopexit, %for.cond1.preheader
+  %total.1.lcssa = phi i32 [ %total.024, %for.cond1.preheader ], [ %add, %for.inc6.loopexit ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp slt i64 %indvars.iv.next, %0
+  br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup.loopexit
+}
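
Note (not part of the patch): a minimal standalone sketch of what the new tuning knob means in practice. It mirrors the "BytesAhead = TTI->getCacheLineSize() * getPrefetchDegree()" computation added to LoopDataPrefetch.cpp, plugging in the Kryo values set in AArch64Subtarget.cpp above (CacheLineSize = 128, PrefetchDegree = 1). The CurrentAddr value is a made-up example address used only for illustration.

// Illustration only: with the Kryo tuning above, an access classified as an
// irregular symbolic stride gets a prefetch of the address
// CacheLineSize * PrefetchDegree bytes past the current access (one 128-byte
// cache line ahead), instead of ItersAhead loop iterations ahead.
#include <cstdint>
#include <cstdio>

int main() {
  // Kryo tuning values from AArch64Subtarget.cpp in this patch.
  const unsigned CacheLineSize = 128;
  const unsigned PrefetchDegree = 1;
  // Mirrors the BytesAhead computation in LoopDataPrefetch.cpp.
  const unsigned BytesAhead = CacheLineSize * PrefetchDegree;

  // Hypothetical address of the current irregular strided access.
  const uint64_t CurrentAddr = 0x1000;
  // For PrefetchNextLine accesses the pass expands AddRec + BytesAhead, so the
  // emitted prefetch targets the next cache line after the access.
  std::printf("prefetch 0x%llx (%u bytes ahead)\n",
              (unsigned long long)(CurrentAddr + BytesAhead), BytesAhead);
  return 0;
}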