diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -191,6 +191,10 @@ "lsr-term-fold", cl::Hidden, cl::init(false), cl::desc("Attempt to replace primary IV with other IV.")); +static cl::opt<bool> AllowDropSolutionIfLessProfitable( + "lsr-drop-sol", cl::Hidden, cl::init(false), + cl::desc("Attempt to drop solution if it is less profitable")); + STATISTIC(NumTermFold, "Number of terminating condition fold recognized and performed"); @@ -1975,6 +1979,10 @@ /// SmallDenseSet. SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors; + /// Baseline cost of the current SCEVs. The best solution found by LSR is + /// dropped when this baseline is cheaper and 'lsr-drop-sol' is enabled. + Cost BaselineCost; + /// Interesting use types, to facilitate truncation reuse. SmallSetVector<Type *, 4> Types; @@ -3294,6 +3302,11 @@ BranchInst *ExitBranch = nullptr; bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI); + // For calculating baseline cost + SmallPtrSet<const SCEV *, 16> Regs; + DenseSet<const SCEV *> VisitedRegs; + DenseSet<size_t> VisitedLSRUse; + for (const IVStrideUse &U : IU) { Instruction *UserInst = U.getUser(); // Skip IV users that are part of profitable IV Chains. 
@@ -3387,6 +3400,14 @@ LF.Offset = Offset; LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + // Create SCEV as Formula for calculating baseline cost + if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) { + Formula F; + F.initialMatch(S, L, SE); + BaselineCost.RateFormula(F, Regs, VisitedRegs, LU); + VisitedLSRUse.insert(LUIdx); + } + if (!LU.WidestFixupType || SE.getTypeSizeInBits(LU.WidestFixupType) < SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) @@ -5162,6 +5183,19 @@ }); assert(Solution.size() == Uses.size() && "Malformed solution!"); + + if (BaselineCost.isLess(SolutionCost)) { + LLVM_DEBUG(dbgs() << "The baseline solution requires "; + BaselineCost.print(dbgs()); dbgs() << "\n"); + if (!AllowDropSolutionIfLessProfitable) { + LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen solution, " + "add option 'lsr-drop-sol' to drop LSR solution.\n"); + } else { + LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen " + "solution, dropping LSR solution.\n"); + Solution.clear(); + } + } } /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree far as @@ -5706,7 +5740,8 @@ MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0 ? PreferredAddresingMode : TTI.getPreferredAddressingMode(L, &SE)), - Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr", false) { + Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr", false), + BaselineCost(L, SE, TTI, AMK) { // If LoopSimplify form is not available, stay out of trouble. 
if (!L->isLoopSimplifyForm()) return; diff --git a/llvm/test/Transforms/LoopStrengthReduce/lsr-drop-solution.ll b/llvm/test/Transforms/LoopStrengthReduce/lsr-drop-solution.ll --- a/llvm/test/Transforms/LoopStrengthReduce/lsr-drop-solution.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/lsr-drop-solution.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O3 -mattr=+v | FileCheck %s +; REQUIRES: asserts +; RUN: llc < %s -O3 -mattr=+v -lsr-drop-sol | FileCheck --check-prefix=CHECK %s +; RUN: llc < %s -O3 -mattr=+v -debug -lsr-drop-sol 2>&1 | FileCheck --check-prefix=DEBUG %s +; RUN: llc < %s -O3 -mattr=+v -debug 2>&1 | FileCheck --check-prefix=DEBUG2 %s target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" target triple = "riscv64-unknown-linux-gnu" @@ -16,27 +19,31 @@ ; CHECK-NEXT: vse8.v v8, (a3) ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_2: # %if.then -; CHECK-NEXT: li a5, 0 -; CHECK-NEXT: add a3, a0, a2 -; CHECK-NEXT: sub a6, a3, a4 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: sub a5, a2, a4 +; CHECK-NEXT: mv a3, a0 ; CHECK-NEXT: .LBB0_3: # %do.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add a3, a1, a5 ; CHECK-NEXT: vsetvli zero, a4, e8, m8, ta, mu -; CHECK-NEXT: vle8.v v8, (a3) -; CHECK-NEXT: add a7, a0, a5 -; CHECK-NEXT: add a5, a5, a4 -; CHECK-NEXT: add a3, a0, a5 -; CHECK-NEXT: vse8.v v8, (a7) -; CHECK-NEXT: bltu a3, a6, .LBB0_3 +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vse8.v v8, (a3) +; CHECK-NEXT: add a3, a3, a4 +; CHECK-NEXT: add a1, a1, a4 +; CHECK-NEXT: bltu a3, a5, .LBB0_3 ; CHECK-NEXT: # %bb.4: # %do.end -; CHECK-NEXT: sub a2, a2, a5 +; CHECK-NEXT: sub a2, a2, a3 ; CHECK-NEXT: vsetvli a2, a2, e8, m8, ta, mu -; CHECK-NEXT: add a1, a1, a5 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, mu ; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vse8.v v8, (a3) ; CHECK-NEXT: ret + +;DEBUG: The chosen solution requires 3 instructions 6 regs, with addrec cost 1, plus 2 base 
adds, plus 5 setup cost +;DEBUG: The baseline solution requires 2 instructions 4 regs, with addrec cost 2, plus 3 setup cost +;DEBUG: Baseline is more profitable than chosen solution, dropping LSR solution. + +;DEBUG2: Baseline is more profitable than chosen solution, add option 'lsr-drop-sol' to drop LSR solution. + entry: %0 = ptrtoint ptr %a0 to i64 %1 = tail call i64 @llvm.riscv.vsetvli.i64(i64 %a2, i64 0, i64 3)