Index: lib/Transforms/Scalar/LoopRerollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopRerollPass.cpp
+++ lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -12,7 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -119,6 +121,16 @@
 // br %cmp, header, exit
 
 namespace {
+  enum IterationLimits {
+    /// The maximum number of iterations that we'll try and reroll. This
+    /// has to be less than 25 in order to fit into a SmallBitVector.
+    IL_MaxRerollIterations = 16,
+    /// The bitvector index used by loop induction variables and other
+    /// instructions that belong to no one particular iteration.
+    IL_LoopIncIdx,
+    IL_End
+  };
+
   class LoopReroll : public LoopPass {
   public:
     static char ID; // Pass ID, replacement for typeid
@@ -334,6 +346,7 @@
       bool findScaleFromMul();
       bool collectAllRoots();
 
+      bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
       void collectInLoopUserSet(const SmallInstructionVector &Roots,
                                 const SmallInstructionSet &Exclude,
                                 const SmallInstructionSet &Final,
@@ -366,12 +379,10 @@
       SmallInstructionVector Roots;
       // All increment instructions for IV.
       SmallInstructionVector LoopIncs;
-      // All instructions transitively used by any root.
-      DenseSet<Instruction *> AllRootUses;
-      // All instructions transitively used by the base.
-      DenseSet<Instruction *> BaseUseSet;
-      // All instructions transitively used by the increments.
-      DenseSet<Instruction *> LoopIncUseSet;
+      // Map of all instructions in the loop (in order) to the iterations
+      // they are used in (or specially, IL_LoopIncIdx for instructions
+      // used in the loop increment mechanism).
+      MapVector<Instruction*, SmallBitVector> Uses;
     };
 
     void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
@@ -592,6 +603,13 @@
   if (!collectAllRoots())
     return false;
 
+  if (Roots.size() > IL_MaxRerollIterations) {
+    DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+          << "#Found=" << Roots.size() << ", #Max=" << IL_MaxRerollIterations
+          << "\n");
+    return false;
+  }
+
   return true;
 }
 
@@ -715,9 +733,62 @@
   return true;
 }
 
-bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
-  BasicBlock *Header = L->getHeader();
+bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
+  // Populate the MapVector with all instructions in the block, in order first,
+  // so we can iterate over the contents later in perfect order.
+  for (auto &I : *L->getHeader())
+    Uses[&I].resize(IL_End);
+
+  SmallInstructionSet Exclude;
+  Exclude.insert(Roots.begin(), Roots.end());
+  Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+  DenseSet<Instruction*> VBase;
+  collectInLoopUserSet(IV, Exclude, PossibleRedSet, VBase);
+  for (auto *I : VBase)
+    Uses[I].set(0);
+
+  unsigned Idx = 1;
+  for (auto *Root : Roots) {
+    DenseSet<Instruction*> V;
+    collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
+
+    // While we're here, check the use sets are the same size.
+    if (V.size() != VBase.size()) {
+      DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+      return false;
+    }
+
+    for (auto *I : V)
+      Uses[I].set(Idx);
+    ++Idx;
+  }
+
+  // Make sure the loop increments are also accounted for.
+  Exclude.clear();
+  Exclude.insert(Roots.begin(), Roots.end());
+
+  DenseSet<Instruction*> V;
+  collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
+  for (auto *I : V)
+    Uses[I].set(IL_LoopIncIdx);
+
+  if (IV != RealIV)
+    Uses[RealIV].set(IL_LoopIncIdx);
+
+  return true;
 
+}
+
+static MapVector<Instruction*, SmallBitVector>::iterator
+nextInstr(int Val, MapVector<Instruction*, SmallBitVector> &In,
+          MapVector<Instruction*, SmallBitVector>::iterator I) {
+  while (I != In.end() && I->second.test(Val) == 0)
+    ++I;
+  return I;
+}
+
+bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
   // We now need to check for equivalence of the use graph of each root with
   // that of the primary induction variable (excluding the roots). Our goal
   // here is not to solve the full graph isomorphism problem, but rather to
@@ -726,9 +797,6 @@
   // is the same (although we will not make an assumption about how the
   // different iterations are intermixed). Note that while the order must be
   // the same, the instructions may not be in the same basic block.
-  SmallInstructionSet Exclude;
-  Exclude.insert(Roots.begin(), Roots.end());
-  Exclude.insert(LoopIncs.begin(), LoopIncs.end());
 
   // An array of just the possible reductions for this scale factor. When we
   // collect the set of all users of some root instructions, these reduction
@@ -740,116 +808,120 @@
   SmallInstructionSet PossibleRedPHISet;
   Reductions.restrictToScale(Scale, PossibleRedSet,
                              PossibleRedPHISet, PossibleRedLastSet);
-                                          
 
-  collectInLoopUserSet(IV, Exclude, PossibleRedSet, BaseUseSet);
-
-  std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1);
+  // Populate "Uses" with where each instruction is used.
+  if (!collectUsedInstructions(PossibleRedSet))
+    return false;
 
-  bool MatchFailed = false;
-  for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) {
-    DenseSet<Instruction *> &RootUseSet = RootUseSets[i];
-    collectInLoopUserSet(Roots[i], SmallInstructionSet(),
-                         PossibleRedSet, RootUseSet);
+  // Make sure we mark the reduction PHIs as used in all iterations.
+  for (auto *I : PossibleRedPHISet)
+    Uses[I].set(IL_LoopIncIdx);
 
-    DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() <<
-                    " vs. iteration increment " << (i+1) <<
-                    " use set size: " << RootUseSet.size() << "\n");
+  // Make sure all instructions in the loop are in one and only one
+  // set.
+  for (auto &KV : Uses)
+    if (KV.second.count() != 1) {
+      DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+            << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+      return false;
+    }
 
-    if (BaseUseSet.size() != RootUseSet.size()) {
-      MatchFailed = true;
-      break;
+  DEBUG(
+    for (auto &KV : Uses) {
+      dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
     }
+    );
 
+  for (unsigned Iter = 1; Iter < Scale; ++Iter) {
     // In addition to regular aliasing information, we need to look for
     // instructions from later (future) iterations that have side effects
     // preventing us from reordering them past other instructions with side
     // effects.
     bool FutureSideEffects = false;
     AliasSetTracker AST(*AA);
-
     // The map between instructions in f(%iv.(i+1)) and f(%iv).
     DenseMap<Value *, Value *> BaseMap;
 
-    assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops");
-    for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(),
-         JE = Header->end(); J1 != JE && !MatchFailed; ++J1) {
-      if (cast<Instruction>(J1) == RealIV)
-        continue;
-      if (cast<Instruction>(J1) == IV)
-        continue;
-      if (!BaseUseSet.count(J1))
-        continue;
-      if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs.
-        continue;
+    // Compare iteration Iter to the base.
+    auto BaseIt = nextInstr(0, Uses, Uses.begin());
+    auto RootIt = nextInstr(Iter, Uses, Uses.begin());
+    auto LastRootIt = Uses.begin();
 
-      while (J2 != JE && (!RootUseSet.count(J2) || Roots[i] ==  J2)) {
-        // As we iterate through the instructions, instructions that don't
-        // belong to previous iterations (or the base case), must belong to
-        // future iterations. We want to track the alias set of writes from
-        // previous iterations.
-        if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) &&
-            !AllRootUses.count(J2)) {
-          if (J2->mayWriteToMemory())
-            AST.add(J2);
-
-          // Note: This is specifically guarded by a check on isa<PHINode>,
-          // which while a valid (somewhat arbitrary) micro-optimization, is
-          // needed because otherwise isSafeToSpeculativelyExecute returns
-          // false on PHI nodes.
-          if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
-            FutureSideEffects = true;
-        }
+    while (BaseIt != Uses.end() && RootIt != Uses.end()) {
+      Instruction *BaseInst = BaseIt->first;
+      Instruction *RootInst = RootIt->first;
 
-        ++J2;
+      // Skip over the IV or root instructions; only match their users.
+      bool Continue = false;
+      if (BaseInst == RealIV || BaseInst == IV) {
+        BaseIt = nextInstr(0, Uses, ++BaseIt);
+        Continue = true;
+      }
+      if (std::find(Roots.begin(), Roots.end(), RootInst) != Roots.end()) {
+        LastRootIt = RootIt;
+        RootIt = nextInstr(Iter, Uses, ++RootIt);
+        Continue = true;
+      }
+      if (Continue) continue;
+
+      // All instructions between the last root and this root
+      // belong to some other iteration. If they belong to a 
+      // future iteration, then they're dangerous to alias with.
+      for (; LastRootIt != RootIt; ++LastRootIt) {
+        Instruction *I = LastRootIt->first;
+        if (LastRootIt->second.find_first() < (int)Iter)
+          continue;
+        if (I->mayWriteToMemory())
+          AST.add(I);
+        // Note: This is specifically guarded by a check on isa<PHINode>,
+        // which while a valid (somewhat arbitrary) micro-optimization, is
+        // needed because otherwise isSafeToSpeculativelyExecute returns
+        // false on PHI nodes.
+        if (!isa<PHINode>(I) && !isSimpleLoadStore(I) &&
+            !isSafeToSpeculativelyExecute(I, DL))
+          // Intervening instructions cause side effects.
+          FutureSideEffects = true;
       }
 
-      if (!J1->isSameOperationAs(J2)) {
-        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                        " vs. " << *J2 << "\n");
-        MatchFailed = true;
-        break;
+      if (!BaseInst->isSameOperationAs(RootInst)) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+              " vs. " << *RootInst << "\n");
+        return false;
       }
 
       // Make sure that this instruction, which is in the use set of this
       // root instruction, does not also belong to the base set or the set of
-      // some previous root instruction.
-      if (BaseUseSet.count(J2) || AllRootUses.count(J2)) {
-        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                        " vs. " << *J2 << " (prev. case overlap)\n");
-        MatchFailed = true;
-        break;
+      // some other root instruction.
+      if (RootIt->second.count() > 1) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                        " vs. " << *RootInst << " (prev. case overlap)\n");
+        return false;
       }
 
       // Make sure that we don't alias with any instruction in the alias set
       // tracker. If we do, then we depend on a future iteration, and we
       // can't reroll.
-      if (J2->mayReadFromMemory()) {
-        for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end();
-             K != KE && !MatchFailed; ++K) {
-          if (K->aliasesUnknownInst(J2, *AA)) {
-            DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                            " vs. " << *J2 << " (depends on future store)\n");
-            MatchFailed = true;
-            break;
+      if (RootInst->mayReadFromMemory())
+        for (auto &K : AST)
+          if (K.aliasesUnknownInst(RootInst, *AA)) {
+            DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                            " vs. " << *RootInst << " (depends on future store)\n");
+            return false;
           }
-        }
-      }
 
       // If we've past an instruction from a future iteration that may have
       // side effects, and this instruction might also, then we can't reorder
       // them, and this matching fails. As an exception, we allow the alias
       // set tracker to handle regular (simple) load/store dependencies.
       if (FutureSideEffects &&
-            ((!isSimpleLoadStore(J1) &&
-              !isSafeToSpeculativelyExecute(J1, DL)) ||
-             (!isSimpleLoadStore(J2) &&
-              !isSafeToSpeculativelyExecute(J2, DL)))) {
-        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                        " vs. " << *J2 <<
+            ((!isSimpleLoadStore(BaseInst) &&
+              !isSafeToSpeculativelyExecute(BaseInst, DL)) ||
+             (!isSimpleLoadStore(RootInst) &&
+              !isSafeToSpeculativelyExecute(RootInst, DL)))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                        " vs. " << *RootInst <<
                         " (side effects prevent reordering)\n");
-        MatchFailed = true;
-        break;
+        return false;
       }
 
       // For instructions that are part of a reduction, if the operation is
@@ -862,41 +934,40 @@
       //   x += a[i]; x += b[i];
       //   x += a[i+1]; x += b[i+1];
       //   x += b[i+2]; x += a[i+2];
-      bool InReduction = Reductions.isPairInSame(J1, J2);
+      bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
 
-      if (!(InReduction && J1->isAssociative())) {
+      if (!(InReduction && BaseInst->isAssociative())) {
         bool Swapped = false, SomeOpMatched = false;
-        for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
-          Value *Op2 = J2->getOperand(j);
+        for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
+          Value *Op2 = RootInst->getOperand(j);
 
           // If this is part of a reduction (and the operation is not
           // associatve), then we match all operands, but not those that are
           // part of the reduction.
           if (InReduction)
             if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
-              if (Reductions.isPairInSame(J2, Op2I))
+              if (Reductions.isPairInSame(RootInst, Op2I))
                 continue;
 
           DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
           if (BMI != BaseMap.end())
             Op2 = BMI->second;
-          else if (Roots[i] == (Instruction*) Op2)
+          else if (Roots[Iter-1] == (Instruction*) Op2)
             Op2 = IV;
 
-          if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+          if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
             // If we've not already decided to swap the matched operands, and
             // we've not already matched our first operand (note that we could
             // have skipped matching the first operand because it is part of a
             // reduction above), and the instruction is commutative, then try
             // the swapped match.
-            if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
-                J1->getOperand(!j) == Op2) {
+            if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
+                BaseInst->getOperand(!j) == Op2) {
               Swapped = true;
             } else {
-              DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                              " vs. " << *J2 << " (operand " << j << ")\n");
-              MatchFailed = true;
-              break;
+              DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+                    << " vs. " << *RootInst << " (operand " << j << ")\n");
+              return false;
             }
           }
 
@@ -904,67 +975,29 @@
         }
       }
 
-      if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) ||
-          (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) {
-        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
-                        " vs. " << *J2 << " (uses outside loop)\n");
-        MatchFailed = true;
-        break;
+      if ((!PossibleRedLastSet.count(BaseInst) &&
+           hasUsesOutsideLoop(BaseInst, L)) ||
+          (!PossibleRedLastSet.count(RootInst) &&
+           hasUsesOutsideLoop(RootInst, L))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                        " vs. " << *RootInst << " (uses outside loop)\n");
+        return false;
       }
 
-      if (!MatchFailed)
-        BaseMap.insert(std::pair<Value *, Value *>(J2, J1));
-
-      AllRootUses.insert(J2);
-      Reductions.recordPair(J1, J2, i+1);
+      Reductions.recordPair(BaseInst, RootInst, Iter);
+      BaseMap.insert(std::make_pair(RootInst, BaseInst));
 
-      ++J2;
+      LastRootIt = RootIt;
+      BaseIt = nextInstr(0, Uses, ++BaseIt);
+      RootIt = nextInstr(Iter, Uses, ++RootIt);
     }
+    assert (BaseIt == Uses.end() && RootIt == Uses.end() &&
+            "Mismatched set sizes!");
   }
 
-  if (MatchFailed)
-    return false;
-
   DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
                   *RealIV << "\n");
 
-  collectInLoopUserSet(LoopIncs, SmallInstructionSet(),
-                       SmallInstructionSet(), LoopIncUseSet);
-  DEBUG(dbgs() << "LRR: Loop increment set size: " <<
-                  LoopIncUseSet.size() << "\n");
-
-  // Make sure that all instructions in the loop have been included in some
-  // use set.
-  for (BasicBlock::iterator J = Header->begin(), JE = Header->end();
-       J != JE; ++J) {
-    if (isa<DbgInfoIntrinsic>(J))
-      continue;
-    if (cast<Instruction>(J) == RealIV)
-      continue;
-    if (cast<Instruction>(J) == IV)
-      continue;
-    if (BaseUseSet.count(J) || AllRootUses.count(J) ||
-        (LoopIncUseSet.count(J) && (J->isTerminator() ||
-                                    isSafeToSpeculativelyExecute(J, DL))))
-      continue;
-
-    if (std::find(Roots.begin(), Roots.end(), J) != Roots.end())
-      continue;
-
-    if (Reductions.isSelectedPHI(J))
-      continue;
-
-    DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV <<
-                    " unprocessed instruction found: " << *J << "\n");
-    MatchFailed = true;
-    break;
-  }
-
-  if (MatchFailed)
-    return false;
-
-  DEBUG(dbgs() << "LRR: all instructions processed from " <<
-                  *RealIV << "\n");
   return true;
 }
 
@@ -973,7 +1006,8 @@
   // Remove instructions associated with non-base iterations.
   for (BasicBlock::reverse_iterator J = Header->rbegin();
        J != Header->rend();) {
-    if (AllRootUses.count(&*J)) {
+    unsigned I = Uses[&*J].find_first();
+    if (I > 0 && I < IL_LoopIncIdx) {
       Instruction *D = &*J;
       DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
       D->eraseFromParent();
@@ -997,12 +1031,14 @@
     SCEVExpander Expander(*SE, "reroll");
     Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin());
 
-    for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(),
-         JE = BaseUseSet.end(); J != JE; ++J)
-      (*J)->replaceUsesOfWith(IV, NewIV);
+    for (auto &KV : Uses) {
+      if (KV.second.find_first() == 0)
+        KV.first->replaceUsesOfWith(IV, NewIV);
+    }
 
     if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
-      if (LoopIncUseSet.count(BI)) {
+      // FIXME: Why do we need this check?
+      if (Uses[BI].find_first() == IL_LoopIncIdx) {
         const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
         if (Inc == 1)
           ICSCEV =