Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -475,6 +475,9 @@
   /// This is currently measured in number of instructions.
   unsigned getPrefetchDistance() const;

+  /// \return The number of cache lines ahead we should prefetch.
+  unsigned getPrefetchDegree() const;
+
   /// \return Some HW prefetchers can handle accesses up to a certain constant
   /// stride. This is the minimum stride in bytes where it makes sense to start
   /// adding SW prefetches. The default is 1, i.e. prefetch with any stride.
@@ -696,6 +699,7 @@
   virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) = 0;
   virtual unsigned getCacheLineSize() = 0;
   virtual unsigned getPrefetchDistance() = 0;
+  virtual unsigned getPrefetchDegree() = 0;
   virtual unsigned getMinPrefetchStride() = 0;
   virtual unsigned getMaxPrefetchIterationsAhead() = 0;
   virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
@@ -896,6 +900,7 @@
     return Impl.getCacheLineSize();
   }
   unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); }
+  unsigned getPrefetchDegree() override { return Impl.getPrefetchDegree(); }
   unsigned getMinPrefetchStride() override {
     return Impl.getMinPrefetchStride();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -296,6 +296,8 @@
   unsigned getPrefetchDistance() { return 0; }

+  unsigned getPrefetchDegree() { return 0; }
+
   unsigned getMinPrefetchStride() { return 1; }

   unsigned getMaxPrefetchIterationsAhead() { return UINT_MAX; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -263,6 +263,10 @@
   return TTIImpl->getPrefetchDistance();
 }

+unsigned TargetTransformInfo::getPrefetchDegree() const {
+  return TTIImpl->getPrefetchDegree();
+}
+
 unsigned TargetTransformInfo::getMinPrefetchStride() const {
   return TTIImpl->getMinPrefetchStride();
 }
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -87,6 +87,7 @@
   uint8_t VectorInsertExtractBaseCost = 3;
   uint16_t CacheLineSize = 0;
   uint16_t PrefetchDistance = 0;
+  uint16_t PrefetchDegree = 0;
   uint16_t MinPrefetchStride = 1;
   unsigned MaxPrefetchIterationsAhead = UINT_MAX;
   unsigned PrefFunctionAlignment = 0;
@@ -198,6 +199,7 @@
   }
   unsigned getCacheLineSize() const { return CacheLineSize; }
   unsigned getPrefetchDistance() const { return PrefetchDistance; }
+  unsigned getPrefetchDegree() const { return PrefetchDegree; }
   unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
   unsigned getMaxPrefetchIterationsAhead() const {
     return MaxPrefetchIterationsAhead;
Index: lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.cpp
+++ lib/Target/AArch64/AArch64Subtarget.cpp
@@ -71,6 +71,7 @@
     VectorInsertExtractBaseCost = 2;
     CacheLineSize = 128;
     PrefetchDistance = 740;
+    PrefetchDegree = 1;
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 11;
     break;
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -135,6 +135,8 @@
   unsigned getPrefetchDistance();

+  unsigned getPrefetchDegree();
+
   unsigned getMinPrefetchStride();

   unsigned getMaxPrefetchIterationsAhead();
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -633,6 +633,10 @@
   return ST->getPrefetchDistance();
 }

+unsigned AArch64TTIImpl::getPrefetchDegree() {
+  return ST->getPrefetchDegree();
+}
+
 unsigned AArch64TTIImpl::getMinPrefetchStride() {
   return ST->getMinPrefetchStride();
 }
Index: lib/Transforms/Scalar/LoopDataPrefetch.cpp
===================================================================
--- lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -52,6 +52,11 @@
                      cl::Hidden);

 static cl::opt<unsigned>
+    PrefetchDegree("prefetch-degree",
+                   cl::desc("Number of cache lines to prefetch ahead"),
+                   cl::Hidden);
+
+static cl::opt<unsigned>
     MinPrefetchStride("min-prefetch-stride",
                       cl::desc("Min stride to add prefetches"), cl::Hidden);
@@ -78,7 +83,10 @@
   /// \brief Check if the stride of the accesses is large enough to
   /// warrant a prefetch.
-  bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+  bool isStrideLargeEnough(const SCEV *Step);
+
+  /// \brief Look for symbolic strides "a[i*stride]".
+  bool isSymbolicStride(const SCEV *V, Loop *L);

   unsigned getMinPrefetchStride() {
     if (MinPrefetchStride.getNumOccurrences() > 0)
@@ -92,6 +100,12 @@
     return TTI->getPrefetchDistance();
   }

+  unsigned getPrefetchDegree() {
+    if (PrefetchDegree.getNumOccurrences() > 0)
+      return PrefetchDegree;
+    return TTI->getPrefetchDegree();
+  }
+
   unsigned getMaxPrefetchIterationsAhead() {
     if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
       return MaxPrefetchIterationsAhead;
@@ -145,13 +159,13 @@
   return new LoopDataPrefetchLegacyPass();
 }

-bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEV *Step) {
   unsigned TargetMinStride = getMinPrefetchStride();
   // No need to check if any stride goes.
   if (TargetMinStride <= 1)
     return true;

-  const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+  const auto *ConstStride = dyn_cast<SCEVConstant>(Step);
   // If MinStride is set, don't prefetch unless we can ensure that stride is
   // larger.
   if (!ConstStride)
@@ -161,6 +175,31 @@
   return TargetMinStride <= AbsStride;
 }

+bool LoopDataPrefetch::isSymbolicStride(const SCEV *V, Loop *L) {
+  const auto *M = dyn_cast<SCEVMulExpr>(V);
+  if (!M)
+    return false;
+  // Check if the step value is a non-unit constant.
+  V = M->getOperand(0);
+  if (V->getSCEVType() != scConstant)
+    return false;
+
+  if (V->isOne() || V->isAllOnesValue())
+    return false;
+
+  V = M->getOperand(1);
+
+  // Strip off casts.
+  while (const auto *C = dyn_cast<SCEVCastExpr>(V))
+    V = C->getOperand();
+
+  const auto *U = dyn_cast<SCEVUnknown>(V);
+  if (!U)
+    return false;
+
+  return (L->isLoopInvariant(U->getValue()));
+}
+
 PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
@@ -245,21 +284,16 @@
     LoopSize = 1;

   unsigned ItersAhead = getPrefetchDistance() / LoopSize;
+  unsigned BytesAhead = TTI->getCacheLineSize() * getPrefetchDegree();
   if (!ItersAhead)
     ItersAhead = 1;

-  if (ItersAhead > getMaxPrefetchIterationsAhead())
-    return MadeChange;
-
-  DEBUG(dbgs() << "Prefetching " << ItersAhead
-               << " iterations ahead (loop size: " << LoopSize << ") in "
-               << L->getHeader()->getParent()->getName() << ": " << *L);
-
   SmallVector<std::pair<Instruction *, const SCEV *>, 16> PrefLoads;
   for (const auto BB : L->blocks()) {
     for (auto &I : *BB) {
       Value *PtrValue;
       Instruction *MemI;
+      bool PrefetchNextLine = false;

       if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
         MemI = LMemI;
@@ -270,7 +304,11 @@
         PtrValue = SMemI->getPointerOperand();
       } else continue;

-      unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+      auto *PtrTy = dyn_cast<PointerType>(PtrValue->getType());
+      if (!PtrTy || PtrTy->isAggregateType())
+        continue;
+
+      unsigned PtrAddrSpace = PtrTy->getPointerAddressSpace();
       if (PtrAddrSpace)
         continue;
@@ -278,13 +316,35 @@
         continue;

       const SCEV *LSCEV = SE->getSCEV(PtrValue);
-      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+      const auto *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
       if (!LSCEVAddRec)
         continue;

+      const SCEV *LSCEVStep = LSCEVAddRec->getStepRecurrence(*SE);
+      if (!LSCEVStep)
+        continue;
+
+      if (getPrefetchDegree() > 0)
+        PrefetchNextLine = isSymbolicStride(LSCEVStep, L);
+
+      if (!PrefetchNextLine &&
+          ItersAhead > getMaxPrefetchIterationsAhead())
+        continue;
+
+      if (PrefetchNextLine) {
+        DEBUG(dbgs() << "Prefetching " << getPrefetchDegree()
+                     << " cache line(s) ahead (symbolic strided access: "
+                     << *PtrValue << ") in "
+                     << L->getHeader()->getParent()->getName() << ": " << *L);
+      } else {
+        DEBUG(dbgs() << "Prefetching " << ItersAhead
+                     << " iterations ahead (loop size: " << LoopSize << ") in "
+                     << L->getHeader()->getParent()->getName() << ": " << *L);
+      }
+
       // Check if the stride of the accesses is large enough to warrant a
       // prefetch.
-      if (!isStrideLargeEnough(LSCEVAddRec))
+      if (!PrefetchNextLine && !isStrideLargeEnough(LSCEVStep))
         continue;

       // We don't want to double prefetch individual cache lines. If this load
@@ -305,9 +365,17 @@
       if (DupPref)
         continue;

-      const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
-          SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
-          LSCEVAddRec->getStepRecurrence(*SE)));
+      // If this is a symbolic stride, prefetch the next cache line of this load.
+      const SCEV *NextLSCEV =
+          PrefetchNextLine
+              ? SE->getAddExpr(LSCEVAddRec,
+                               SE->getConstant(LSCEVAddRec->getType(),
+                                               BytesAhead))
+              : SE->getAddExpr(
+                    LSCEVAddRec,
+                    SE->getMulExpr(
+                        SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
+                        LSCEVAddRec->getStepRecurrence(*SE)));
       if (!isSafeToExpand(NextLSCEV, *SE))
         continue;
Index: test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll
===================================================================
--- test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll
+++ test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll
@@ -1,7 +1,9 @@
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -max-prefetch-iters-ahead=1000 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=SYMBOLIC_PREFETCH --check-prefix=ALL
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -max-prefetch-iters-ahead=1000 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
 ; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -S < %s | FileCheck %s --check-prefix=SYMBOLIC_PREFETCH --check-prefix=ALL

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"

@@ -51,3 +53,35 @@
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+; Prefetching in the presence of symbolic strides:
+;
+;   for (unsigned i = 0; i < 100; i++)
+;     A[i + 1] = A[Stride * i] + B[i];
+
+; ALL-LABEL: @symbolic_stride(
+define void @symbolic_stride(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N,
+                             i64 %stride) {
+entry:
+  br label %for.body
+
+; ALL: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %mul = mul i64 %indvars.iv, %stride
+; SYMBOLIC_PREFETCH: call void @llvm.prefetch
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %mul
+  %load = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %load_1 = load i32, i32* %arrayidx2, align 4
+  %add = add i32 %load_1, %load
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  store i32 %add, i32* %arrayidx_next, align 4
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}
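
For reference, the address arithmetic this patch introduces can be summarized outside of LLVM as a small standalone C++ sketch. This is an illustration only, not code from the patch: the helper name prefetchTargetAddr and its parameters are hypothetical stand-ins for the SCEV expressions the pass builds, and the constants mirror the Kryo settings above (128-byte cache lines, PrefetchDegree = 1).

// Illustrative sketch only -- not part of the patch. It mirrors the two
// strategies in LoopDataPrefetch::runOnLoop: prefetch a fixed number of cache
// lines past a symbolically strided access, or ItersAhead iterations ahead of
// a constant-stride recurrence.
#include <cstdint>
#include <cstdio>

// Hypothetical helper; the names are stand-ins for the SCEVs built in the pass.
static uint64_t prefetchTargetAddr(uint64_t CurAddr, bool SymbolicStride,
                                   uint64_t CacheLineSize, uint64_t PrefetchDegree,
                                   uint64_t ItersAhead, uint64_t StepBytes) {
  if (SymbolicStride)
    return CurAddr + CacheLineSize * PrefetchDegree; // "BytesAhead" in the patch
  return CurAddr + ItersAhead * StepBytes;           // pre-existing behavior
}

int main() {
  // Kryo-like parameters from the patch: 128-byte lines, degree 1.
  std::printf("symbolic stride: 0x%llx\n",
              (unsigned long long)prefetchTargetAddr(0x1000, true, 128, 1, 11, 4));
  std::printf("constant stride: 0x%llx\n",
              (unsigned long long)prefetchTargetAddr(0x1000, false, 128, 1, 11, 4));
  return 0;
}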