Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -20,6 +20,11 @@
 
 #define DEBUG_TYPE "aarch64tti"
 
+// Hidden flag that turns off the Falkor HW prefetcher unroll limit.
+static cl::opt<bool>
+    DisableFalkorHWPFUnrollFix("disable-falkor-hwpf-unroll-fix",
+                               cl::init(false), cl::Hidden);
+
 /// \brief Calculate the cost of materializing a 64-bit value. This helper
 /// method might only calculate a fraction of a larger immediate. Therefore it
 /// is valid to return a cost of ZERO.
@@ -644,6 +649,63 @@
 
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
+
+  // For Falkor, we want to avoid having too many strided loads in a loop since
+  // that can exhaust the HW prefetcher resources. We adjust the unroller
+  // MaxCount preference below to attempt to ensure unrolling doesn't create too
+  // many strided loads.
+  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
+      !DisableFalkorHWPFUnrollFix) {
+    // Strided-load budget from which the unroll MaxCount below is derived.
+    // NOTE(review): 7 presumably reflects the HW prefetcher capacity; confirm.
+    const int MaxStridedLoads = 7;
+    // Counts the loads in L whose pointer operand is not loop-invariant and
+    // whose SCEV is an affine AddRec. Returns early once the count exceeds
+    // MaxStridedLoads / 2.
+    auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
+      int StridedLoads = 0;
+      // FIXME? We could make this more precise by looking at the CFG and
+      // e.g. not counting loads in each side of an if-then-else diamond.
+      for (const auto BB : L->blocks()) {
+        for (auto &I : *BB) {
+          LoadInst *LMemI = dyn_cast<LoadInst>(&I);
+          if (!LMemI)
+            continue;
+
+          Value *PtrValue = LMemI->getPointerOperand();
+          if (L->isLoopInvariant(PtrValue))
+            continue;
+
+          const SCEV *LSCEV = SE.getSCEV(PtrValue);
+          const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+          if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
+            continue;
+
+          // FIXME? We could take pairing of unrolled load copies into account
+          // by looking at the AddRec, but we would probably have to limit this
+          // to loops with no stores or other memory optimization barriers.
+          ++StridedLoads;
+          // We've seen enough strided loads that seeing more won't make a
+          // difference: once the count exceeds MaxStridedLoads / 2 the quotient
+          // MaxStridedLoads / StridedLoads computed by the caller is already 1.
+          if (StridedLoads > MaxStridedLoads / 2)
+            return StridedLoads;
+        }
+      }
+      return StridedLoads;
+    };
+
+    int StridedLoads = countStridedLoads(L, SE);
+    DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
+                 << " strided loads\n");
+    // Pick the largest power of 2 unroll count that won't result in too many
+    // strided loads.
+    if (StridedLoads) {
+      UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
+      DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
+                   << UP.MaxCount << '\n');
+    }
+  }
 }
 
 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Index: test/Transforms/LoopUnroll/AArch64/falkor-prefetch.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopUnroll/AArch64/falkor-prefetch.ll
@@ -0,0 +1,169 @@
+; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=falkor | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=falkor -disable-falkor-hwpf-unroll-fix | FileCheck %s --check-prefix=NOHWPF
+
+; Check that loop unroller doesn't exhaust HW prefetcher resources.
+
+; Partial unroll 2 times for this loop on falkor instead of 4.
+; NOHWPF-LABEL: @unroll1(
+; NOHWPF-LABEL: loop:
+; NOHWPF-NEXT: phi
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: icmp
+; NOHWPF-NEXT: br
+; NOHWPF-LABEL: exit:
+; Default invocation (fix active): the loop body appears only twice.
+; CHECK-LABEL: @unroll1(
+; CHECK-LABEL: loop:
+; CHECK-NEXT: phi
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: load
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: load
+; CHECK-NEXT: add
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: load
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: load
+; CHECK-NEXT: add
+; CHECK-NEXT: icmp
+; CHECK-NEXT: br
+; CHECK-LABEL: exit:
+define void @unroll1(i32* %p, i32* %p2) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
+
+  %gep = getelementptr inbounds i32, i32* %p, i32 %iv
+  %load = load volatile i32, i32* %gep
+
+  %gep2 = getelementptr inbounds i32, i32* %p2, i32 %iv
+  %load2 = load volatile i32, i32* %gep2
+
+  %inc = add i32 %iv, 1
+  %exitcnd = icmp uge i32 %inc, 1024
+  br i1 %exitcnd, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Partial unroll 4 times for the inner loop (loop2) on falkor instead of 8.
+; NOHWPF-LABEL: @unroll2(
+; NOHWPF-LABEL: loop2:
+; NOHWPF-NEXT: phi
+; NOHWPF-NEXT: phi
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: getelementptr
+; NOHWPF-NEXT: load
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: icmp
+; NOHWPF-NEXT: br
+; NOHWPF-LABEL: exit2:
+; Default invocation (fix active): the inner loop body appears only 4 times.
+; CHECK-LABEL: @unroll2(
+; CHECK-LABEL: loop2:
+; CHECK-NEXT: phi
+; CHECK-NEXT: phi
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: load
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: load
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: load
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: load
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: icmp
+; CHECK-NEXT: br
+; CHECK-LABEL: exit2:
+
+define void @unroll2(i32* %p) {
+entry:
+  br label %loop1
+
+loop1:
+  %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ]
+  %outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ]
+  br label %loop2.header
+
+loop2.header:
+  br label %loop2
+
+loop2:
+  %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ]
+  %sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ]
+  %gep = getelementptr inbounds i32, i32* %p, i32 %iv2
+  %load = load i32, i32* %gep
+  %sum.inc = add i32 %sum, %load
+  %inc2 = add i32 %iv2, 1
+  %exitcnd2 = icmp uge i32 %inc2, 1024
+  br i1 %exitcnd2, label %exit2, label %loop2
+
+exit2:
+  br label %loop1.latch
+
+loop1.latch:
+  %inc1 = add i32 %iv1, 1
+  %exitcnd1 = icmp uge i32 %inc1, 1024
+  br i1 %exitcnd1, label %exit, label %loop1
+
+exit:
+  ret void
+}
+