Index: llvm/trunk/include/llvm/Analysis/LoopUnrollAnalyzer.h =================================================================== --- llvm/trunk/include/llvm/Analysis/LoopUnrollAnalyzer.h +++ llvm/trunk/include/llvm/Analysis/LoopUnrollAnalyzer.h @@ -89,6 +89,7 @@ bool visitLoad(LoadInst &I); bool visitCastInst(CastInst &I); bool visitCmpInst(CmpInst &I); + bool visitPHINode(PHINode &PN); }; } #endif Index: llvm/trunk/lib/Analysis/LoopUnrollAnalyzer.cpp =================================================================== --- llvm/trunk/lib/Analysis/LoopUnrollAnalyzer.cpp +++ llvm/trunk/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -189,3 +189,13 @@ return Base::visitCmpInst(I); } + +bool UnrolledInstAnalyzer::visitPHINode(PHINode &PN) { + // Run base visitor first. This way we can gather some useful for later + // analysis information. + if (Base::visitPHINode(PN)) + return true; + + // The loop induction PHI nodes are definitionally free. + return PN.getParent() == L->getHeader(); +} Index: llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -185,6 +185,40 @@ } namespace { +/// A struct to densely store the state of an instruction after unrolling at +/// each iteration. +/// +/// This is designed to work like a tuple of for the +/// purposes of hashing and lookup, but to be able to associate two boolean +/// states with each key. +struct UnrolledInstState { + Instruction *I; + int Iteration : 30; + unsigned IsFree : 1; + unsigned IsCounted : 1; +}; + +/// Hashing and equality testing for a set of the instruction states. +struct UnrolledInstStateKeyInfo { + typedef DenseMapInfo PtrInfo; + typedef DenseMapInfo> PairInfo; + static inline UnrolledInstState getEmptyKey() { + return {PtrInfo::getEmptyKey(), 0, 0, 0}; + } + static inline UnrolledInstState getTombstoneKey() { + return {PtrInfo::getTombstoneKey(), 0, 0, 0}; + } + static inline unsigned getHashValue(const UnrolledInstState &S) { + return PairInfo::getHashValue({S.I, S.Iteration}); + } + static inline bool isEqual(const UnrolledInstState &LHS, + const UnrolledInstState &RHS) { + return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration}); + } +}; +} + +namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. int UnrolledCost; @@ -218,18 +252,25 @@ assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) && "The unroll iterations max is too large!"); + // Only analyze inner loops. We can't properly estimate cost of nested loops + // and we won't visit inner loops again anyway. + if (!L->empty()) + return None; + // Don't simulate loops with a big or unknown tripcount if (!UnrollMaxIterationsCountToAnalyze || !TripCount || TripCount > UnrollMaxIterationsCountToAnalyze) return None; SmallSetVector BBWorklist; + SmallSetVector, 4> ExitWorklist; DenseMap SimplifiedValues; SmallVector, 4> SimplifiedInputValues; // The estimated cost of the unrolled form of the loop. We try to estimate // this by simplifying as much as we can while computing the estimate. int UnrolledCost = 0; + // We also track the estimated dynamic (that is, actually executed) cost in // the rolled form. This helps identify cases when the savings from unrolling // aren't just exposing dead control flows, but actual reduced dynamic @@ -237,6 +278,97 @@ // unrolling. int RolledDynamicCost = 0; + // We track the simplification of each instruction in each iteration. We use + // this to recursively merge costs into the unrolled cost on-demand so that + // we don't count the cost of any dead code. This is essentially a map from + // to , but stored as a densely packed struct. + DenseSet InstCostMap; + + // A small worklist used to accumulate cost of instructions from each + // observable and reached root in the loop. + SmallVector CostWorklist; + + // PHI-used worklist used between iterations while accumulating cost. + SmallVector PHIUsedList; + + // Helper function to accumulate cost for instructions in the loop. + auto AddCostRecursively = [&](Instruction &RootI, int Iteration) { + assert(Iteration >= 0 && "Cannot have a negative iteration!"); + assert(CostWorklist.empty() && "Must start with an empty cost list"); + assert(PHIUsedList.empty() && "Must start with an empty phi used list"); + CostWorklist.push_back(&RootI); + for (;; --Iteration) { + do { + Instruction *I = CostWorklist.pop_back_val(); + + // InstCostMap only uses I and Iteration as a key, the other two values + // don't matter here. + auto CostIter = InstCostMap.find({I, Iteration, 0, 0}); + if (CostIter == InstCostMap.end()) + // If an input to a PHI node comes from a dead path through the loop + // we may have no cost data for it here. What that actually means is + // that it is free. + continue; + auto &Cost = *CostIter; + if (Cost.IsCounted) + // Already counted this instruction. + continue; + + // Mark that we are counting the cost of this instruction now. + Cost.IsCounted = true; + + // If this is a PHI node in the loop header, just add it to the PHI set. + if (auto *PhiI = dyn_cast(I)) + if (PhiI->getParent() == L->getHeader()) { + assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " + "inherently simplify during unrolling."); + if (Iteration == 0) + continue; + + // Push the incoming value from the backedge into the PHI used list + // if it is an in-loop instruction. We'll use this to populate the + // cost worklist for the next iteration (as we count backwards). + if (auto *OpI = dyn_cast( + PhiI->getIncomingValueForBlock(L->getLoopLatch()))) + if (L->contains(OpI)) + PHIUsedList.push_back(OpI); + continue; + } + + // First accumulate the cost of this instruction. + if (!Cost.IsFree) { + UnrolledCost += TTI.getUserCost(I); + DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration + << "): "); + DEBUG(I->dump()); + } + + // We must count the cost of every operand which is not free, + // recursively. If we reach a loop PHI node, simply add it to the set + // to be considered on the next iteration (backwards!). + for (Value *Op : I->operands()) { + // Check whether this operand is free due to being a constant or + // outside the loop. + auto *OpI = dyn_cast(Op); + if (!OpI || !L->contains(OpI)) + continue; + + // Otherwise accumulate its cost. + CostWorklist.push_back(OpI); + } + } while (!CostWorklist.empty()); + + if (PHIUsedList.empty()) + // We've exhausted the search. + break; + + assert(Iteration > 0 && + "Cannot track PHI-used values past the first iteration!"); + CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end()); + PHIUsedList.clear(); + } + }; + // Ensure that we don't violate the loop structure invariants relied on by // this analysis. assert(L->isLoopSimplifyForm() && "Must put loop into normal form first."); @@ -291,22 +423,32 @@ // it. We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { - int InstCost = TTI.getUserCost(&I); + // Track this instruction's expected baseline cost when executing the + // rolled loop form. + RolledDynamicCost += TTI.getUserCost(&I); // Visit the instruction to analyze its loop cost after unrolling, - // and if the visitor returns false, include this instruction in the - // unrolled cost. - if (!Analyzer.visit(I)) - UnrolledCost += InstCost; - else { - DEBUG(dbgs() << " " << I - << " would be simplified if loop is unrolled.\n"); - (void)0; - } + // and if the visitor returns true, mark the instruction as free after + // unrolling and continue. + bool IsFree = Analyzer.visit(I); + bool Inserted = InstCostMap.insert({&I, (int)Iteration, IsFree, + /*IsCounted*/ false}) + .second; + (void)Inserted; + assert(Inserted && "Cannot have a state for an unvisited instruction!"); + + if (IsFree) + continue; - // Also track this instructions expected cost when executing the rolled - // loop form. - RolledDynamicCost += InstCost; + // If the instruction might have a side-effect recursively account for + // the cost of it and all the instructions leading up to it. + if (I.mayHaveSideEffects()) + AddCostRecursively(I, Iteration); + + // Can't properly model a cost of a call. + // FIXME: With a proper cost model we should be able to do it. + if(isa(&I)) + return None; // If unrolled body turns out to be too big, bail out. if (UnrolledCost > MaxUnrolledLoopSize) { @@ -335,6 +477,8 @@ cast(SimpleCond)->isZero() ? 1 : 0); if (L->contains(Succ)) BBWorklist.insert(Succ); + else + ExitWorklist.insert({BB, Succ}); continue; } } @@ -350,6 +494,8 @@ .getCaseSuccessor(); if (L->contains(Succ)) BBWorklist.insert(Succ); + else + ExitWorklist.insert({BB, Succ}); continue; } } @@ -358,6 +504,8 @@ for (BasicBlock *Succ : successors(BB)) if (L->contains(Succ)) BBWorklist.insert(Succ); + else + ExitWorklist.insert({BB, Succ}); } // If we found no optimization opportunities on the first iteration, we @@ -368,6 +516,23 @@ return None; } } + + while (!ExitWorklist.empty()) { + BasicBlock *ExitingBB, *ExitBB; + std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val(); + + for (Instruction &I : *ExitBB) { + auto *PN = dyn_cast(&I); + if (!PN) + break; + + Value *Op = PN->getIncomingValueForBlock(ExitingBB); + if (auto *OpI = dyn_cast(Op)) + if (L->contains(OpI)) + AddCostRecursively(*OpI, TripCount - 1); + } + } + DEBUG(dbgs() << "Analysis finished:\n" << "UnrolledCost: " << UnrolledCost << ", " << "RolledDynamicCost: " << RolledDynamicCost << "\n"); Index: llvm/trunk/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll =================================================================== --- llvm/trunk/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll +++ llvm/trunk/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=50 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=70 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @unknown_global = internal unnamed_addr global [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 Index: llvm/trunk/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll =================================================================== --- llvm/trunk/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll +++ llvm/trunk/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +@known_constant = internal unnamed_addr constant [10 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0], align 16 + +; If a load becomes a constant after loop unrolling, we sometimes can simplify +; CFG. This test verifies that we handle such cases. +; After one operand in an instruction is constant-folded and the +; instruction is simplified, the other operand might become dead. +; In this test we have:: +; for i in 1..10: +; r += A[i] * B[i] +; A[i] is 0 almost at every iteration, so there is no need in loading B[i] at +; all. + + +; CHECK-LABEL: @unroll_dce +; CHECK-NOT: br i1 %exitcond, label %for.end, label %for.body +define i32 @unroll_dce(i32* noalias nocapture readonly %b) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv.0 = phi i64 [ 0, %entry ], [ %iv.1, %for.body ] + %r.0 = phi i32 [ 0, %entry ], [ %r.1, %for.body ] + %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* @known_constant, i64 0, i64 %iv.0 + %x1 = load i32, i32* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv.0 + %x2 = load i32, i32* %arrayidx2, align 4 + %mul = mul i32 %x1, %x2 + %r.1 = add i32 %mul, %r.0 + %iv.1 = add nuw nsw i64 %iv.0, 1 + %exitcond = icmp eq i64 %iv.1, 10 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %r.1 +} Index: llvm/trunk/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll =================================================================== --- llvm/trunk/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll +++ llvm/trunk/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=40 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; When examining gep-instructions we shouldn't consider them simplified if the Index: llvm/trunk/unittests/Analysis/UnrollAnalyzer.cpp =================================================================== --- llvm/trunk/unittests/Analysis/UnrollAnalyzer.cpp +++ llvm/trunk/unittests/Analysis/UnrollAnalyzer.cpp @@ -134,6 +134,7 @@ " br label %outer.loop\n" "outer.loop:\n" " %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %outer.loop.latch ]\n" + " %iv.outer.next = add nuw nsw i64 %iv.outer, 1\n" " br label %inner.loop\n" "inner.loop:\n" " %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]\n" @@ -141,7 +142,6 @@ " %exitcond.inner = icmp eq i64 %iv.inner.next, 1000\n" " br i1 %exitcond.inner, label %outer.loop.latch, label %inner.loop\n" "outer.loop.latch:\n" - " %iv.outer.next = add nuw nsw i64 %iv.outer, 1\n" " %exitcond.outer = icmp eq i64 %iv.outer.next, 40\n" " br i1 %exitcond.outer, label %exit, label %outer.loop\n" "exit:\n" @@ -163,11 +163,15 @@ BasicBlock *InnerBody = &*FI++; BasicBlock::iterator BBI = Header->begin(); - Instruction *Y1 = &*BBI++; + BBI++; + Instruction *Y1 = &*BBI; BBI = InnerBody->begin(); - Instruction *Y2 = &*BBI++; + BBI++; + Instruction *Y2 = &*BBI; // Check that we can simplify IV of the outer loop, but can't simplify the IV // of the inner loop if we only know the iteration number of the outer loop. + // + // Y1 is %iv.outer.next, Y2 is %iv.inner.next auto I1 = SimplifiedValuesVector[0].find(Y1); EXPECT_TRUE(I1 != SimplifiedValuesVector[0].end()); auto I2 = SimplifiedValuesVector[0].find(Y2);