Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -428,6 +428,11 @@ /// adding SW prefetches. The default is 1, i.e. prefetch with any stride. unsigned getMinPrefetchStride() const; + /// \return The maximum number of iterations to prefetch ahead. If the + /// required number of iterations is more than this number, no prefetching is + /// performed. + unsigned getMaxPrefetchIterationsAhead() const; + /// \return The maximum interleave factor that any transform should try to /// perform for this target. This number depends on the level of parallelism /// and the number of execution units in the CPU. @@ -624,6 +629,7 @@ virtual unsigned getCacheLineSize() = 0; virtual unsigned getPrefetchDistance() = 0; virtual unsigned getMinPrefetchStride() = 0; + virtual unsigned getMaxPrefetchIterationsAhead() = 0; virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0; virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, @@ -797,6 +803,9 @@ unsigned getMinPrefetchStride() override { return Impl.getMinPrefetchStride(); } + unsigned getMaxPrefetchIterationsAhead() override { + return Impl.getMaxPrefetchIterationsAhead(); + } unsigned getMaxInterleaveFactor(unsigned VF) override { return Impl.getMaxInterleaveFactor(VF); } Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -270,6 +270,8 @@ unsigned getMinPrefetchStride() { return 1; } + unsigned getMaxPrefetchIterationsAhead() { return UINT_MAX; } + unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, Index: lib/Analysis/TargetTransformInfo.cpp 
=================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -227,6 +227,10 @@ return TTIImpl->getMinPrefetchStride(); } +unsigned TargetTransformInfo::getMaxPrefetchIterationsAhead() const { + return TTIImpl->getMaxPrefetchIterationsAhead(); +} + unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const { return TTIImpl->getMaxInterleaveFactor(VF); } Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -133,6 +133,8 @@ unsigned getPrefetchDistance(); unsigned getMinPrefetchStride(); + + unsigned getMaxPrefetchIterationsAhead(); /// @} }; Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -31,6 +31,13 @@ cl::desc("Min stride to add prefetches for Cyclone"), cl::init(2048), cl::Hidden); +// Be conservative for now and don't prefetch ahead too much since the loop +// may terminate early. +static cl::opt<unsigned> CycloneMaxPrefetchIterationsAhead( + "cyclone-max-prefetch-iters-ahead", + cl::desc("Max number of iterations to prefetch ahead on Cyclone"), + cl::init(3), cl::Hidden); + /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. 
@@ -602,3 +609,9 @@ return CycloneMinPrefetchStride; return BaseT::getMinPrefetchStride(); } + +unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { + if (ST->isCyclone()) + return CycloneMaxPrefetchIterationsAhead; + return BaseT::getMaxPrefetchIterationsAhead(); +} Index: lib/Transforms/Scalar/LoopDataPrefetch.cpp =================================================================== --- lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -171,6 +171,9 @@ if (!ItersAhead) ItersAhead = 1; + if (ItersAhead > TTI->getMaxPrefetchIterationsAhead()) + return MadeChange; + DEBUG(dbgs() << "Prefetching " << ItersAhead << " iterations ahead (loop size: " << LoopSize << ") in " << L->getHeader()->getParent()->getName() << ": " << *L); Index: test/Transforms/LoopDataPrefetch/AArch64/large-stride.ll =================================================================== --- test/Transforms/LoopDataPrefetch/AArch64/large-stride.ll +++ test/Transforms/LoopDataPrefetch/AArch64/large-stride.ll @@ -1,4 +1,5 @@ -; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL +; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch -cyclone-max-prefetch-iters-ahead=100 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL +; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL ; RUN: opt -mcpu=generic -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"