Index: llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -164,9 +164,9 @@
     cl::init(std::numeric_limits<uint16_t>::max()),
     cl::desc("LSR search space complexity limit"));
 
-static cl::opt<bool> EnableRecursiveSetupCost(
-    "lsr-recursive-setupcost", cl::Hidden, cl::init(true),
-    cl::desc("Enable more thorough lsr setup cost calculation"));
+static cl::opt<unsigned> SetupCostDepthLimit(
+    "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
+    cl::desc("The limit on recursion depth for LSR's setup cost"));
 
 #ifndef NDEBUG
 // Stress test IV chain generation.
@@ -1212,22 +1212,23 @@
                                  bool HasBaseReg, int64_t Scale,
                                  Instruction *Fixup = nullptr);
 
-static unsigned getSetupCost(const SCEV *Reg) {
+static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
   if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
     return 1;
-  if (!EnableRecursiveSetupCost)
+  if (Depth == 0)
     return 0;
   if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
-    return getSetupCost(S->getStart());
+    return getSetupCost(S->getStart(), Depth - 1);
   if (auto S = dyn_cast<SCEVCastExpr>(Reg))
-    return getSetupCost(S->getOperand());
+    return getSetupCost(S->getOperand(), Depth - 1);
   if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
     return std::accumulate(S->op_begin(), S->op_end(), 0,
-                           [](unsigned i, const SCEV *Reg) {
-                             return i + getSetupCost(Reg);
+                           [&](unsigned i, const SCEV *Reg) {
+                             return i + getSetupCost(Reg, Depth - 1);
                            });
   if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
-    return getSetupCost(S->getLHS()) + getSetupCost(S->getRHS());
+    return getSetupCost(S->getLHS(), Depth - 1) +
+           getSetupCost(S->getRHS(), Depth - 1);
   return 0;
 }
 
@@ -1293,7 +1294,9 @@
 
     // Rough heuristic; favor registers which don't require extra setup
     // instructions in the preheader.
-    C.SetupCost += getSetupCost(Reg);
+    C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
+    // Ensure we don't, even with the recursion limit, produce invalid costs.
+    C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
 
     C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
                    SE->hasComputableLoopEvolution(Reg, L);
Index: llvm/test/CodeGen/Hexagon/swp-carried-1.ll
===================================================================
--- llvm/test/CodeGen/Hexagon/swp-carried-1.ll
+++ llvm/test/CodeGen/Hexagon/swp-carried-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-recursive-setupcost=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 < %s | FileCheck %s
 
 ; Test that we generate the correct code when a loop carried value
 ; is scheduled one stage earlier than its use.
Index: llvm/test/Transforms/LoopStrengthReduce/gnarly-setupcost.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopStrengthReduce/gnarly-setupcost.ll
@@ -0,0 +1,90 @@
+; RUN: opt < %s -loop-reduce -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+; Check this completes sensibly
+; CHECK-LABEL: func
+; CHECK: load i32, i32* %gep
+
+define i32 @func(i32* %in) {
+entry:
+  %load = load i32, i32* %in, align 4
+  %a1 = add i32 %load, 1
+  %m1 = mul i32 %a1, %load
+  %a2 = add i32 %m1, 1
+  %m2 = mul i32 %a2, %m1
+  %a3 = add i32 %m2, 1
+  %m3 = mul i32 %a3, %m2
+  %a4 = add i32 %m3, 1
+  %m4 = mul i32 %a4, %m3
+  %a5 = add i32 %m4, 1
+  %m5 = mul i32 %a5, %m4
+  %a6 = add i32 %m5, 1
+  %m6 = mul i32 %a6, %m5
+  %a7 = add i32 %m6, 1
+  %m7 = mul i32 %a7, %m6
+  %a8 = add i32 %m7, 1
+  %m8 = mul i32 %a8, %m7
+  %a9 = add i32 %m8, 1
+  %m9 = mul i32 %a9, %m8
+  %a10 = add i32 %m9, 1
+  %m10 = mul i32 %a10, %m9
+  %a11 = add i32 %m10, 1
+  %m11 = mul i32 %a11, %m10
+  %a12 = add i32 %m11, 1
+  %m12 = mul i32 %a12, %m11
+  %a13 = add i32 %m12, 1
+  %m13 = mul i32 %a13, %m12
+  %a14 = add i32 %m13, 1
+  %m14 = mul i32 %a14, %m13
+  %a15 = add i32 %m14, 1
+  %m15 = mul i32 %a15, %m14
+  %a16 = add i32 %m15, 1
+  %m16 = mul i32 %a16, %m15
+  %a17 = add i32 %m16, 1
+  %m17 = mul i32 %a17, %m16
+  %a18 = add i32 %m17, 1
+  %m18 = mul i32 %a18, %m17
+  %a19 = add i32 %m18, 1
+  %m19 = mul i32 %a19, %m18
+  %a20 = add i32 %m19, 1
+  %m20 = mul i32 %a20, %m19
+  %a21 = add i32 %m20, 1
+  %m21 = mul i32 %a21, %m20
+  %a22 = add i32 %m21, 1
+  %m22 = mul i32 %a22, %m21
+  %a23 = add i32 %m22, 1
+  %m23 = mul i32 %a23, %m22
+  %a24 = add i32 %m23, 1
+  %m24 = mul i32 %a24, %m23
+  %a25 = add i32 %m24, 1
+  %m25 = mul i32 %a25, %m24
+  %a26 = add i32 %m25, 1
+  %m26 = mul i32 %a26, %m25
+  %a27 = add i32 %m26, 1
+  %m27 = mul i32 %a27, %m26
+  %a28 = add i32 %m27, 1
+  %m28 = mul i32 %a28, %m27
+  %a29 = add i32 %m28, 1
+  %m29 = mul i32 %a29, %m28
+  %a30 = add i32 %m29, 1
+  %m30 = mul i32 %a30, %m29
+  %a31 = add i32 %m30, 1
+  %m31 = mul i32 %a31, %m30
+  br label %loop
+
+loop:
+  %lp = phi i32 [ %m31, %entry ], [ %linc, %loop ]
+  %0 = sext i32 %lp to i64
+  %gep = getelementptr inbounds i32, i32* %in, i64 %0
+  %loopload = load i32, i32* %gep, align 4
+  store i32 0, i32* %gep, align 4
+  %linc = add i32 %lp, 1
+  %lcmp = icmp eq i32 %linc, 100
+  br i1 %lcmp, label %exit, label %loop
+
+exit:
+  %ll = phi i32 [ %loopload, %loop ]
+  ret i32 %ll
+}
+
Index: llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
===================================================================
--- llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
+++ llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-reduce -lsr-recursive-setupcost=0 -S | FileCheck %s
+; RUN: opt < %s -loop-reduce -lsr-setupcost-depth-limit=1 -S | FileCheck %s
 
 ; This test is adapted from the n-body test of the LLVM test-suite: A bug in
 ; r345114 caused LSR to generate incorrect code. The test verifies that the
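
A note on the mechanism: the core change threads an explicit Depth parameter through getSetupCost so that the cost walk over a SCEV expression is cut off after a fixed number of recursion levels. Without the limit, expressions like the %aN/%mN multiply chain in gnarly-setupcost.ll above make the walk exponential in the depth of the expression, since shared subtrees are revisited once per path. Below is a minimal standalone sketch of the same depth-limiting pattern; the Expr type, its node kinds, and the setupCost name are invented for illustration and are not LLVM APIs.

#include <cstdio>

// Hypothetical expression node, standing in for LLVM's SCEV hierarchy.
struct Expr {
  enum Kind { Leaf, Add, Mul } K;
  const Expr *LHS;
  const Expr *RHS;
};

// Depth-limited cost walk, mirroring the shape of the patched getSetupCost:
// leaves cost 1, and once the budget reaches 0 we conservatively return 0
// instead of recursing further, so the walk touches at most O(2^DepthLimit)
// nodes no matter how large the expression is when fully expanded.
static unsigned setupCost(const Expr *E, unsigned Depth) {
  if (E->K == Expr::Leaf)
    return 1;
  if (Depth == 0)
    return 0; // Budget exhausted: give up rather than blow up compile time.
  return setupCost(E->LHS, Depth - 1) + setupCost(E->RHS, Depth - 1);
}

int main() {
  // Shared subtrees (like the multiply chain in the test) make the expanded
  // tree exponentially large, which is exactly what the limit guards against.
  Expr L{Expr::Leaf};
  Expr A{Expr::Add, &L, &L};
  Expr M{Expr::Mul, &A, &A};
  std::printf("cost(depth=7) = %u\n", setupCost(&M, 7)); // prints 4
  std::printf("cost(depth=1) = %u\n", setupCost(&M, 1)); // prints 0
  return 0;
}

The in-tree patch additionally clamps the accumulated value (std::min against 1 << 16) so that even a deep partially-explored expression cannot push SetupCost into a range where cost comparisons misbehave.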