diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopUnrollAnalyzer.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -251,7 +252,35 @@ L->dump()); return false; } - if (Metrics.NumInsts > MaxHeaderSize) + + // We track the simplification of each instruction in each iteration. We use + // this to recursively merge costs into the unrolled cost on-demand so that + // we don't count the cost of any dead code. This is essentially a map from + // to , but stored as a densely packed + // struct. + DenseSet InstCostMap; + // + // The estimated cost of the unrolled form of the loop. We try to estimate + // this by simplifying as much as we can while computing the estimate. + unsigned UnrolledCost = 0; + + // We also track the estimated dynamic (that is, actually executed) cost in + // the rolled form. This helps identify cases when the savings from + // unrolling aren't just exposing dead control flows, but actual reduced + // dynamic instructions due to the simplifications which we expect to occur + // after unrolling. + unsigned RolledDynamicCost = 0; + + DenseMap SimplifiedValues; + + UnrolledInstAnalyzer Analyzer(0, SimplifiedValues, EphValues, InstCostMap, + RolledDynamicCost, UnrolledCost, + MaxHeaderSize, *SE, *TTI, L); + if (!Analyzer.visitBB(*OrigHeader)) + return false; + + Analyzer.AddCostRecursively(*OrigHeader->getTerminator(), 0); + if (UnrolledCost > MaxHeaderSize + 1) return false; } diff --git a/llvm/test/Transforms/LoopRotate/oz-disable.ll b/llvm/test/Transforms/LoopRotate/oz-disable.ll --- a/llvm/test/Transforms/LoopRotate/oz-disable.ll +++ b/llvm/test/Transforms/LoopRotate/oz-disable.ll @@ -2,14 +2,18 @@ ; RUN: opt < %s -S -Os -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OS ; RUN: opt < %s -S -Oz -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OZ -; Loop should be rotated for -Os but not for -Oz. -; OS: rotating Loop at depth 1 -; OZ-NOT: rotating Loop at depth 1 - @e = global i32 10 declare void @use(i32) +; Loop should be rotated for -Os but not for -Oz, as not all instructions in +; the header are free. + +; OS: LoopRotation: rotating Loop at depth 1 containing: %loop
,%loop.fin +; OS-NEXT: LoopRotation: into Loop at depth 1 containing: %loop.fin
+ +; OZ-NOT: LoopRotation: rotating Loop at depth 1 containing: %loop
,%loop.fin +; OZ-NOT: LoopRotation: into Loop at depth 1 containing: %loop.fin
define void @test() { entry: %end = load i32, i32* @e @@ -28,3 +32,29 @@ exit: ret void } + +; Loop should be rotated for both -Os and -Oz, as all instructions in the header +; are free. + +; OS-NEXT: LoopRotation: rotating Loop at depth 1 containing: %loop2
,%loop2.fin +; OS-NEXT: LoopRotation: into Loop at depth 1 containing: %loop2.fin
+ +; OZ: LoopRotation: rotating Loop at d +; OZ-NEXT: LoopRotation: into Loop at depth 1 containing: %loop2.fin
+define void @test2() { +entry: + br label %loop2 + +loop2: + %n.phi = phi i32 [ %n, %loop2.fin ], [ 0, %entry ] + %cond = icmp eq i32 %n.phi, 15 + br i1 %cond, label %exit, label %loop2.fin + +loop2.fin: + %n = add i32 %n.phi, 1 + call void @use(i32 %n) + br label %loop2 + +exit: + ret void +}