Index: lib/Transforms/Utils/LoopUnrollPeel.cpp =================================================================== --- lib/Transforms/Utils/LoopUnrollPeel.cpp +++ lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -46,6 +46,9 @@ "unroll-force-peel-count", cl::init(0), cl::Hidden, cl::desc("Force a peel count regardless of profiling information.")); +// Designates infinite invariant depth of a Phi node. +static const unsigned InfiniteDepth = UINT_MAX; + // Check whether we are capable of peeling this loop. static bool canPeel(Loop *L) { // Make sure the loop is in simplified form @@ -66,6 +69,56 @@ return true; } +// This function calculates the invariant depth of a loop Phi node. The +// pre-calculated values are memorized in Depths map. The entire calculation +// happens according to definition: +// Given %x = phi , ..., [%y, %back.edge]. +// If %y is a loop invariant, then Depth(%x) = 1. +// If %y is a Phi from the loop header, Depth(%x) = Depth(%y) + 1. +// Otherwise, Depth(%x) is infinite. +// TODO: Actually if %y is an expression that depends only on Phi %z and some +// loop invariants, we can estimate Depth(%x) = Depth(%y) + 1. The example +// looks like: +// %x = phi(0, %a), <-- becomes invariant starting from 3rd iteration. +// %y = phi(0, 5), +// %a = %y + 1. +static unsigned calculateInvDepth(PHINode *Phi, Loop *L, + SmallDenseMap &Depths) { + assert(Phi->getParent() == L->getHeader() && + "Requested invariant depth calculation for non-loop Phi?"); + // If we already know the answer, take it from the map. + auto I = Depths.find(Phi); + if (I != Depths.end()) + return I->second; + + BasicBlock *BackEdge = L->getLoopLatch(); + assert(BackEdge && "Loop is not in simplified form?"); + // Otherwise we need to analyze the input from the back edge. + Value *Input = Phi->getIncomingValueForBlock(BackEdge); + // Place infinity to map to avoid infinite recursion for cycled Phis. Such + // cycles can never stop on an invariant. + Depths[Phi] = InfiniteDepth; + unsigned Depth = InfiniteDepth; + + if (L->isLoopInvariant(Input)) + Depth = 1u; + else if (PHINode *IncPhi = dyn_cast(Input)) { + // Only consider Phis in header block. + if (IncPhi->getParent() != L->getHeader()) + return InfiniteDepth; + // Try to estimate the depth of input. If it is finite, our phi's + // depth is greater by 1 according to definition. + unsigned InputDepth = calculateInvDepth(IncPhi, L, Depths); + if (InputDepth != InfiniteDepth) + Depth = InputDepth + 1u; + } + + // If we found that this Phi lies in an invariant chain, update the map. + if (Depth != InfiniteDepth) + Depths[Phi] = Depth; + return Depth; +} + // Return the number of iterations we want to peel off. void llvm::computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, @@ -78,30 +131,41 @@ if (!L->empty()) return; - // Try to find a Phi node that has the same loop invariant as an input from - // its only back edge. If there is such Phi, peeling 1 iteration from the - // loop is profitable, because starting from 2nd iteration we will have an - // invariant instead of this Phi. - if (LoopSize <= UP.Threshold) { - BasicBlock *BackEdge = L->getLoopLatch(); - assert(BackEdge && "Loop is not in simplified form?"); - BasicBlock *Header = L->getHeader(); - // Iterate over Phis to find one with invariant input on back edge. - bool FoundCandidate = false; - PHINode *Phi; - for (auto BI = Header->begin(); isa(&*BI); ++BI) { - Phi = cast(&*BI); - Value *Input = Phi->getIncomingValueForBlock(BackEdge); - if (L->isLoopInvariant(Input)) { - FoundCandidate = true; - break; - } + // Here we try to get rid of Phis which become invariants after 1, 2, ..., N + // iterations of the loop. In order to do this, we for every Phi in loop's + // header we define the Invariant Depth value which is calculated as follows: + // + // Given %x = phi , ..., [%y, %back.edge]. + // If %y is a loop invariant, then Depth(%x) = 1. + // If %y is a Phi from the loop header, Depth(%x) = Depth(%y) + 1. + // Otherwise, Depth(%x) is infinite. + // + // Notice that if we peel a loop, all Phis with Depth = 1 become invariants, + // and all other Phis decrease the depth by 1. + // Thus, peeling N first iterations allows us to turn all Phis with Depth <= N + // into invariants. Having fewer Phis is a good thing for the register + // allocator, and it also opens window for other optimizations. + // First, check that we can peel at least one iteration. + if (2 * LoopSize <= UP.Threshold) { + // Store pre-calculated values of Depth here. + SmallDenseMap Depths; + // Now go through all Phis to calculate their depth. Max (finite) depth is + // the desired number of iterations we want to peel. + unsigned DesiredPeelCount = 0; + for (auto BI = L->getHeader()->begin(); isa(&*BI); ++BI) { + PHINode *Phi = cast(&*BI); + unsigned Depth = calculateInvDepth(Phi, L, Depths); + if (Depth != InfiniteDepth) + DesiredPeelCount = std::max(DesiredPeelCount, Depth); } - if (FoundCandidate) { - DEBUG(dbgs() << "Peel one iteration to get rid of " << *Phi - << " because starting from 2nd iteration it is always" - << " an invariant\n"); - UP.PeelCount = 1; + if (DesiredPeelCount > 0) { + // Pay respect to limitations applied to peeled code size. + unsigned MaxPeelCount = UP.Threshold / LoopSize - 1; + DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount); + assert(DesiredPeelCount > 0 && "Wrong loop size estimation?"); + DEBUG(dbgs() << "Peel " << DesiredPeelCount << " iteration(s) to get rid" + << " of some Phis forming an invariant chain.\n"); + UP.PeelCount = DesiredPeelCount; return; } } Index: test/Transforms/LoopUnroll/peel-loop-not-forced.ll =================================================================== --- test/Transforms/LoopUnroll/peel-loop-not-forced.ll +++ test/Transforms/LoopUnroll/peel-loop-not-forced.ll @@ -1,12 +1,14 @@ -; RUN: opt < %s -S -loop-unroll -unroll-threshold=4 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-threshold=30 | FileCheck %s define i32 @invariant_backedge_1(i32 %a, i32 %b) { +; This loop should be peeled once because it has a Phi which becomes invariant +; starting from 2st iteration. ; CHECK-LABEL: @invariant_backedge_1 -; CHECK-NOT: %plus = phi -; CHECK: loop.peel: +; CHECK: loop.peel{{.*}}: ; CHECK: loop: ; CHECK: %i = phi ; CHECK: %sum = phi +; CHECK-NOT: %plus = phi entry: br label %loop @@ -25,10 +27,112 @@ ret i32 %sum } -; Peeling should fail due to method size. define i32 @invariant_backedge_2(i32 %a, i32 %b) { +; This loop should be peeled twice because it has a Phi which becomes invariant +; starting from 3rd iteration. ; CHECK-LABEL: @invariant_backedge_2 -; CHECK-NOT: loop.peel: +; CHECK: loop.peel{{.*}}: +; CHECK: loop.peel{{.*}}: +; CHECK: %i = phi +; CHECK: %sum = phi +; CHECK-NOT: %half.inv = phi +; CHECK-NOT: %plus = phi +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %sum = phi i32 [ 0, %entry ], [ %incsum, %loop ] + %half.inv = phi i32 [ %a, %entry ], [ %b, %loop ] + %plus = phi i32 [ %a, %entry ], [ %half.inv, %loop ] + + %incsum = add i32 %sum, %plus + %inc = add i32 %i, 1 + %cmp = icmp slt i32 %i, 1000 + + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %sum +} + +define i32 @invariant_backedge_3(i32 %a, i32 %b) { +; This loop should be peeled trice because it has a Phi which becomes invariant +; starting from 4th iteration. +; CHECK-LABEL: @invariant_backedge_3 +; CHECK: loop.peel{{.*}}: +; CHECK: loop.peel{{.*}}: +; CHECK: loop.peel{{.*}}: +; CHECK: %i = phi +; CHECK: %sum = phi +; CHECK-NOT: %half.inv = phi +; CHECK-NOT: %half.inv.2 = phi +; CHECK-NOT: %plus = phi +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %sum = phi i32 [ 0, %entry ], [ %incsum, %loop ] + %half.inv = phi i32 [ %a, %entry ], [ %b, %loop ] + %half.inv.2 = phi i32 [ %a, %entry ], [ %half.inv, %loop ] + %plus = phi i32 [ %a, %entry ], [ %half.inv.2, %loop ] + + %incsum = add i32 %sum, %plus + %inc = add i32 %i, 1 + %cmp = icmp slt i32 %i, 1000 + + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %sum +} + +define i32 @invariant_backedge_limited_by_size(i32 %a, i32 %b) { +; This loop should normally be peeled trice because it has a Phi which becomes +; invariant starting from 4th iteration, but the size of the loop only allows +; us to peel twice because we are restricted to 30 instructions in resulting +; code. Thus, %plus Phi node should stay in loop even despite its backedge +; input is an invariant. +; CHECK-LABEL: @invariant_backedge_limited_by_size +; CHECK: loop.peel{{.*}}: +; CHECK: loop.peel{{.*}}: +; CHECK: %i = phi +; CHECK: %sum = phi +; CHECK: %plus = phi i32 [ %a, {{.*}} ], [ %b, %loop ] +; CHECK-NOT: %half.inv = phi +; CHECK-NOT: %half.inv.2 = phi +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %sum = phi i32 [ 0, %entry ], [ %incsum, %loop ] + %half.inv = phi i32 [ %a, %entry ], [ %b, %loop ] + %half.inv.2 = phi i32 [ %a, %entry ], [ %half.inv, %loop ] + %plus = phi i32 [ %a, %entry ], [ %half.inv.2, %loop ] + + %incsum = add i32 %sum, %plus + %inc = add i32 %i, 1 + %cmp = icmp slt i32 %i, 1000 + + %incsum2 = add i32 %incsum, %plus + %incsum3 = add i32 %incsum, %plus + %incsum4 = add i32 %incsum, %plus + %incsum5 = add i32 %incsum, %plus + %incsum6 = add i32 %incsum, %plus + %incsum7 = add i32 %incsum, %plus + + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %sum +} + +; Peeling should fail due to method size. +define i32 @invariant_backedge_negative(i32 %a, i32 %b) { +; CHECK-LABEL: @invariant_backedge_negative +; CHECK-NOT: loop.peel{{.*}}: ; CHECK: loop: ; CHECK: %i = phi ; CHECK: %sum = phi @@ -43,6 +147,47 @@ %incsum = add i32 %sum, %plus %incsum2 = add i32 %incsum, %plus + %incsum3 = add i32 %incsum, %plus + %incsum4 = add i32 %incsum, %plus + %incsum5 = add i32 %incsum, %plus + %incsum6 = add i32 %incsum, %plus + %incsum7 = add i32 %incsum, %plus + %incsum8 = add i32 %incsum, %plus + %incsum9 = add i32 %incsum, %plus + %incsum10 = add i32 %incsum, %plus + %incsum11 = add i32 %incsum, %plus + %incsum12 = add i32 %incsum, %plus + %incsum13 = add i32 %incsum, %plus + %incsum14 = add i32 %incsum, %plus + %incsum15 = add i32 %incsum, %plus + %inc = add i32 %i, 1 + %cmp = icmp slt i32 %i, 1000 + + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %sum +} + +define i32 @cycled_phis(i32 %a, i32 %b) { +; Make sure that we do not crash working with cycled Phis and don't peel it. +; TODO: Actually this loop should be partially unrolled with factor 2. +; CHECK-LABEL: @cycled_phis +; CHECK-NOT: loop.peel{{.*}}: +; CHECK: loop: +; CHECK: %i = phi +; CHECK: %phi.a = phi +; CHECK: %phi.b = phi +; CHECK: %sum = phi +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %phi.a = phi i32 [ %a, %entry ], [ %phi.b, %loop ] + %phi.b = phi i32 [ %b, %entry ], [ %phi.a, %loop ] + %sum = phi i32 [ 0, %entry], [ %incsum, %loop ] + %incsum = add i32 %sum, %phi.a %inc = add i32 %i, 1 %cmp = icmp slt i32 %i, 1000