Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -423,6 +423,11 @@
   /// This is currently measured in number of instructions.
   unsigned getPrefetchDistance() const;
 
+  /// \return Some HW prefetchers can handle accesses up to a certain constant
+  /// stride. This is the minimum stride in bytes where it makes sense to start
+  /// adding SW prefetches. The default is 1, i.e. prefetch with any stride.
+  unsigned getMinPrefetchStride() const;
+
   /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
   /// and the number of execution units in the CPU.
@@ -618,6 +623,7 @@
   virtual unsigned getRegisterBitWidth(bool Vector) = 0;
   virtual unsigned getCacheLineSize() = 0;
   virtual unsigned getPrefetchDistance() = 0;
+  virtual unsigned getMinPrefetchStride() = 0;
   virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
   virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                           OperandValueKind Opd1Info,
@@ -788,6 +794,9 @@
     return Impl.getCacheLineSize();
   }
   unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); }
+  unsigned getMinPrefetchStride() override {
+    return Impl.getMinPrefetchStride();
+  }
   unsigned getMaxInterleaveFactor(unsigned VF) override {
     return Impl.getMaxInterleaveFactor(VF);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -268,6 +268,8 @@
 
   unsigned getPrefetchDistance() { return 0; }
 
+  unsigned getMinPrefetchStride() { return 1; }
+
   unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
 
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -223,6 +223,10 @@
   return TTIImpl->getPrefetchDistance();
 }
 
+unsigned TargetTransformInfo::getMinPrefetchStride() const {
+  return TTIImpl->getMinPrefetchStride();
+}
+
 unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const {
   return TTIImpl->getMaxInterleaveFactor(VF);
 }
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -131,6 +131,8 @@
   unsigned getCacheLineSize();
 
   unsigned getPrefetchDistance();
+
+  unsigned getMinPrefetchStride();
 
   /// @}
 };
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -25,6 +25,12 @@
     cl::desc("Number of instructions to prefetch ahead for Cyclone"),
     cl::init(280), cl::Hidden);
 
+// The HW prefetcher handles accesses with strides up to 2KB.
+static cl::opt<unsigned> CycloneMinPrefetchStride(
+    "cyclone-min-prefetch-stride",
+    cl::desc("Min stride to add prefetches for Cyclone"),
+    cl::init(2048), cl::Hidden);
+
 /// \brief Calculate the cost of materializing a 64-bit value. This helper
 /// method might only calculate a fraction of a larger immediate. Therefore it
 /// is valid to return a cost of ZERO.
@@ -590,3 +596,9 @@
     return CyclonePrefetchDistance;
   return BaseT::getPrefetchDistance();
 }
+
+unsigned AArch64TTIImpl::getMinPrefetchStride() {
+  if (ST->isCyclone())
+    return CycloneMinPrefetchStride;
+  return BaseT::getMinPrefetchStride();
+}
Index: lib/Transforms/Scalar/LoopDataPrefetch.cpp
===================================================================
--- lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -73,6 +73,10 @@
   bool runOnFunction(Function &F) override;
   bool runOnLoop(Loop *L);
 
+  /// \brief Check if the stride of the accesses is large enough to
+  /// warrant a prefetch.
+  bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+
 private:
   AssumptionCache *AC;
   LoopInfo *LI;
@@ -94,6 +98,22 @@
 
 FunctionPass *llvm::createLoopDataPrefetchPass() { return new LoopDataPrefetch(); }
 
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
+  unsigned TargetMinStride = TTI->getMinPrefetchStride();
+  // No need to check if any stride goes.
+  if (TargetMinStride <= 1)
+    return true;
+
+  const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+  // If MinStride is set, don't prefetch unless we can ensure that stride is
+  // larger.
+  if (!ConstStride)
+    return false;
+
+  unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
+  return TargetMinStride <= AbsStride;
+}
+
 bool LoopDataPrefetch::runOnFunction(Function &F) {
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
@@ -184,6 +204,11 @@
     if (!LSCEVAddRec)
       continue;
 
+    // Check if the stride of the accesses is large enough to warrant a
+    // prefetch.
+    if (!isStrideLargeEnough(LSCEVAddRec))
+      continue;
+
     // We don't want to double prefetch individual cache lines. If this load
     // is known to be within one cache line of some other load that has
     // already been prefetched, then don't prefetch this one as well.
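
Note (illustration only, not part of the patch): the stride check above boils down to a magnitude comparison against the target's minimum. The standalone C++ sketch below mirrors that logic under an assumed 2048-byte Cyclone-like threshold; the helper name strideWarrantsPrefetch is invented here. On the tests that follow, a unit stride of doubles (8 bytes/iteration) is left to the HW prefetcher while a stride of 300 doubles (2400 bytes/iteration) gets SW prefetches.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Mirrors the magnitude comparison in LoopDataPrefetch::isStrideLargeEnough;
// the function name is a stand-in, not the LLVM API.
static bool strideWarrantsPrefetch(int64_t StrideBytes, unsigned MinStride) {
  // A minimum stride of 1 (the default) means any stride qualifies.
  if (MinStride <= 1)
    return true;
  // Only the magnitude matters; a negative stride just walks backwards.
  return MinStride <= static_cast<uint64_t>(std::llabs(StrideBytes));
}

int main() {
  const unsigned CycloneMin = 2048; // assumed, matching cl::init(2048) above
  // small_stride test: 1 double/iteration = 8 bytes -> HW prefetcher covers it.
  std::printf("stride    8 -> %s\n",
              strideWarrantsPrefetch(8, CycloneMin) ? "prefetch" : "skip");
  // large_stride test: 300 doubles/iteration = 2400 bytes -> add SW prefetches.
  std::printf("stride 2400 -> %s\n",
              strideWarrantsPrefetch(2400, CycloneMin) ? "prefetch" : "skip");
  return 0;
}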
Index: test/Transforms/LoopDataPrefetch/AArch64/large-stride.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopDataPrefetch/AArch64/large-stride.ll
@@ -0,0 +1,51 @@
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=CYCLONE --check-prefix=BOTH
+; RUN: opt -mcpu=generic -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=GENERIC --check-prefix=BOTH
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+
+; BOTH-LABEL: @small_stride(
+define void @small_stride(double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; BOTH: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; BOTH-NOT: call void @llvm.prefetch
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.end, label %for.body
+
+; BOTH: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; BOTH-LABEL: @large_stride(
+define void @large_stride(double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; BOTH: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; CYCLONE: call void @llvm.prefetch
+; GENERIC-NOT: call void @llvm.prefetch
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 300
+  %exitcond = icmp eq i64 %indvars.iv.next, 160000
+  br i1 %exitcond, label %for.end, label %for.body
+
+; BOTH: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}
Index: test/Transforms/LoopDataPrefetch/AArch64/lit.local.cfg
===================================================================
--- /dev/null
+++ test/Transforms/LoopDataPrefetch/AArch64/lit.local.cfg
@@ -0,0 +1,4 @@
+config.suffixes = ['.ll']
+
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
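
Note (illustration only, not part of the patch): the new hook follows the usual TTI delegation pattern, with the permissive default of 1 in TargetTransformInfoImpl.h and a subtarget-specific override in AArch64TTIImpl. A minimal sketch of that layering with invented stand-in classes, not the real TTI hierarchy:

#include <cstdio>
#include <memory>

// Stand-in for the default in TargetTransformInfoImpl.h.
struct BaseTTI {
  virtual ~BaseTTI() = default;
  virtual unsigned getMinPrefetchStride() { return 1; } // any stride qualifies
};

// Stand-in for AArch64TTIImpl on Cyclone: the HW prefetcher covers strides
// up to 2KB, so SW prefetches only start paying off beyond that.
struct CycloneLikeTTI : BaseTTI {
  unsigned getMinPrefetchStride() override { return 2048; }
};

int main() {
  std::unique_ptr<BaseTTI> Generic(new BaseTTI);
  std::unique_ptr<BaseTTI> Cyclone(new CycloneLikeTTI);
  std::printf("generic min stride: %u\n", Generic->getMinPrefetchStride()); // 1
  std::printf("cyclone min stride: %u\n", Cyclone->getMinPrefetchStride()); // 2048
  return 0;
}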