Index: include/llvm/Analysis/AccessAnalysis.h
===================================================================
--- /dev/null
+++ include/llvm/Analysis/AccessAnalysis.h
@@ -0,0 +1,387 @@
+//===-  -  -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+
+#ifndef LLVM_ANALYSIS_ACCESSANALYSIS_H
+#define LLVM_ANALYSIS_ACCESSANALYSIS_H
+
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+/// Optimization analysis message produced during vectorization. Messages inform
+/// the user why vectorization did not occur.
+class Report {
+  std::string Message;
+  raw_string_ostream Out;
+  Instruction *Instr;
+
+public:
+  Report(Instruction *I = nullptr) : Out(Message), Instr(I) {
+    Out << "loop not vectorized: ";
+  }
+
+  template <typename A> Report &operator<<(const A &Value) {
+    Out << Value;
+    return *this;
+  }
+
+  Instruction *getInstr() { return Instr; }
+
+  std::string &str() { return Out.str(); }
+  operator Twine() { return Out.str(); }
+};
+
+/// This struct holds information about the memory runtime legality
+/// check that a group of pointers do not overlap.
+struct RuntimePointerCheck {
+  RuntimePointerCheck() : Need(false) {}
+
+  /// Reset the state of the pointer runtime information.
+  void reset() {
+    Need = false;
+    Pointers.clear();
+    Starts.clear();
+    Ends.clear();
+    IsWritePtr.clear();
+    DependencySetId.clear();
+    AliasSetId.clear();
+    PartitionSetId.clear();
+  }
+
+  void print();
+
+  /// Insert a pointer and calculate the start and end SCEVs.
+  void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
+              unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides);
+
+  /// This flag indicates if we need to add the runtime check.
+  bool Need;
+  /// Holds the pointers that we need to check.
+  SmallVector<TrackingVH<Value>, 2> Pointers;
+  /// Holds the pointer value at the beginning of the loop.
+  SmallVector<const SCEV*, 2> Starts;
+  /// Holds the pointer value at the end of the loop.
+  SmallVector<const SCEV*, 2> Ends;
+  /// Holds the information if this pointer is used for writing to memory.
+  SmallVector<bool, 2> IsWritePtr;
+  /// Holds the id of the set of pointers that could be dependent because of a
+  /// shared underlying object.
+  SmallVector<unsigned, 2> DependencySetId;
+  /// Holds the id of the disjoint alias set to which this pointer belongs.
+  SmallVector<unsigned, 2> AliasSetId;
+  // FIXME: this does not work if we have more than 64 partitions
+  SmallVector<uint64_t, 2> PartitionSetId;
+};
+
+/// \brief Analyses memory accesses in a loop.
+///
+/// Checks whether run time pointer checks are needed and builds sets for data
+/// dependence checking.
+class AccessAnalysis {
+public:
+  /// \brief Read or write access location.
+  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
+  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+
+  /// \brief Set of potential dependent memory accesses.
+  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
+
+  AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
+    DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
+
+  /// \brief Register a load  and whether it is only read from.
+  void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
+    Value *Ptr = const_cast<Value*>(Loc.Ptr);
+    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
+    Accesses.insert(MemAccessInfo(Ptr, false));
+    if (IsReadOnly)
+      ReadOnlyPtr.insert(Ptr);
+  }
+
+  /// \brief Register a store.
+  void addStore(AliasAnalysis::Location &Loc) {
+    Value *Ptr = const_cast<Value*>(Loc.Ptr);
+    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
+    Accesses.insert(MemAccessInfo(Ptr, true));
+  }
+
+  /// \brief Check whether we can check the pointers at runtime for
+  /// non-intersection.
+  bool canCheckPtrAtRT(RuntimePointerCheck &RtCheck,
+                       unsigned &NumComparisons, ScalarEvolution *SE,
+                       Loop *TheLoop, ValueToValueMap &Strides,
+                       bool ShouldCheckStride = false);
+
+  /// \brief Goes over all memory accesses, checks whether a RT check is needed
+  /// and builds sets of dependent accesses.
+  void buildDependenceSets() {
+    processMemAccesses();
+  }
+
+  bool isRTCheckNeeded() { return IsRTCheckNeeded; }
+
+  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+  void resetDepChecks() { CheckDeps.clear(); }
+
+  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
+
+private:
+  typedef SetVector<MemAccessInfo> PtrAccessSet;
+
+  /// \brief Go over all memory access and check whether runtime pointer checks
+  /// are needed /// and build sets of dependency check candidates.
+  void processMemAccesses();
+
+  /// Set of all accesses.
+  PtrAccessSet Accesses;
+
+  /// Set of accesses that need a further dependence check.
+  MemAccessInfoSet CheckDeps;
+
+  /// Set of pointers that are read only.
+  SmallPtrSet<Value*, 16> ReadOnlyPtr;
+
+  const DataLayout *DL;
+
+  /// An alias set tracker to partition the access set by underlying object and
+  //intrinsic property (such as TBAA metadata).
+  AliasSetTracker AST;
+
+  /// Sets of potentially dependent accesses - members of one set share an
+  /// underlying pointer. The set "CheckDeps" identfies which sets really need a
+  /// dependence check.
+  DepCandidates &DepCands;
+
+  bool IsRTCheckNeeded;
+};
+
+class RuntimeCheckEmitter {
+public:
+  /// \brief Add code that checks at runtime if the accessed arrays overlap.
+  ///
+  /// Returns a pair of instructions where the first element is the first
+  /// instruction generated in possibly a sequence of instructions and the
+  /// second value is the final comparator value or NULL if no check is needed.
+  std::pair<Instruction *, Instruction *>
+  addRuntimeCheck(Instruction *Loc, RuntimePointerCheck *PtrRtCheck,
+                  ScalarEvolution *SE, Loop *Loop);
+};
+
+struct OrderedInstruction {
+  Instruction *I;
+  unsigned Order;
+  OrderedInstruction(Instruction *I, unsigned Order) : I(I), Order(Order) {
+  }
+};
+bool operator<(const OrderedInstruction &LHS, const OrderedInstruction &RHS);
+
+/// \brief Checks memory dependences among accesses to the same underlying
+/// object to determine whether there vectorization is legal or not (and at
+/// which vectorization factor).
+///
+/// This class works under the assumption that we already checked that memory
+/// locations with different underlying pointers are "must-not alias".
+/// We use the ScalarEvolution framework to symbolically evalutate access
+/// functions pairs. Since we currently don't restructure the loop we can rely
+/// on the program order of memory accesses to determine their safety.
+/// At the moment we will only deem accesses as safe for:
+///  * A negative constant distance assuming program order.
+///
+///      Safe: tmp = a[i + 1];     OR     a[i + 1] = x;
+///            a[i] = tmp;                y = a[i];
+///
+///   The latter case is safe because later checks guarantuee that there can't
+///   be a cycle through a phi node (that is, we check that "x" and "y" is not
+///   the same variable: a header phi can only be an induction or a reduction, a
+///   reduction can't have a memory sink, an induction can't have a memory
+///   source). This is important and must not be violated (or we have to
+///   resort to checking for cycles through memory).
+///
+///  * A positive constant distance assuming program order that is bigger
+///    than the biggest memory access.
+///
+///     tmp = a[i]        OR              b[i] = x
+///     a[i+2] = tmp                      y = b[i+2];
+///
+///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
+///
+///  * Zero distances and all accesses have the same size.
+///
+class MemoryDepChecker {
+public:
+  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
+  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+  typedef std::vector<std::pair<OrderedInstruction, OrderedInstruction> >
+    DependenceVector;
+
+  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
+      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
+        ShouldRetryWithRuntimeCheck(false) {}
+
+  /// \brief Register the location (instructions are given increasing numbers)
+  /// of a write access.
+  void addAccess(StoreInst *SI) {
+    Value *Ptr = SI->getPointerOperand();
+    Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
+    InstMap.push_back(SI);
+    ++AccessIdx;
+  }
+
+  /// \brief Register the location (instructions are given increasing numbers)
+  /// of a write access.
+  void addAccess(LoadInst *LI) {
+    Value *Ptr = LI->getPointerOperand();
+    Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
+    InstMap.push_back(LI);
+    ++AccessIdx;
+  }
+
+  /// \brief Check whether the dependencies between the accesses are safe.
+  ///
+  /// Only checks sets with elements in \p CheckDeps.
+  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+                   MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides);
+
+  void findUnsafeDeps(AccessAnalysis::DepCandidates &AccessSets,
+                      MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides,
+                      MemoryDepChecker::DependenceVector &UnsafeDeps);
+
+  /// \brief The maximum number of bytes of a vector register we can vectorize
+  /// the accesses safely with.
+  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+
+  /// \brief In same cases when the dependency check fails we can still
+  /// vectorize the loop with a dynamic array access check.
+  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
+
+private:
+  ScalarEvolution *SE;
+  const DataLayout *DL;
+  const Loop *InnermostLoop;
+public:
+  /// \brief Maps access locations (ptr, read/write) to program order.
+  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
+private:
+public:
+  /// \brief Memory access instructions in program order.
+  SmallVector<Instruction *, 16> InstMap;
+private:
+
+  /// \brief The program order index to be used for the next instruction.
+  unsigned AccessIdx;
+
+  // We can access this many bytes in parallel safely.
+  unsigned MaxSafeDepDistBytes;
+
+  /// \brief If we see a non-constant dependence distance we can still try to
+  /// vectorize this loop with runtime checks.
+  bool ShouldRetryWithRuntimeCheck;
+
+  /// \brief Check whether there is a plausible dependence between the two
+  /// accesses.
+  ///
+  /// Access \p A must happen before \p B in program order. The two indices
+  /// identify the index into the program order map.
+  ///
+  /// This function checks  whether there is a plausible dependence (or the
+  /// absence of such can't be proved) between the two accesses. If there is a
+  /// plausible dependence but the dependence distance is bigger than one
+  /// element access it records this distance in \p MaxSafeDepDistBytes (if this
+  /// distance is smaller than any other distance encountered so far).
+  /// Otherwise, this function returns true signaling a possible dependence.
+  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
+                   const MemAccessInfo &B, unsigned BIdx,
+                   ValueToValueMap &Strides);
+
+  /// \brief Check whether the data dependence could prevent store-load
+  /// forwarding.
+  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+};
+
+static inline Instruction *getFirstInst(Instruction *FirstInst, Value *V,
+                                 Instruction *Loc) {
+  if (FirstInst)
+    return FirstInst;
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    return I->getParent() == Loc->getParent() ? I : nullptr;
+  return nullptr;
+}
+
+static inline void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
+  if (L.empty())
+    return V.push_back(&L);
+
+  for (Loop *InnerL : L)
+    addInnerLoop(*InnerL, V);
+}
+
+class LoopAccessAnalysis : public RuntimeCheckEmitter {
+public:
+  unsigned NumLoads;
+  unsigned NumStores;
+  unsigned NumPredStores;
+
+  LoopAccessAnalysis(Loop *L, const DataLayout *DL,
+                     AliasAnalysis *AA, ScalarEvolution *SCEV) :
+    AA(AA),
+    SE(SCEV),
+    Accesses(DL, AA, DependentAccesses),
+    DepChecker(SE, DL, L),
+    TheLoop(L){}
+
+  bool analyzeLoop();
+  // FIXME
+  void emitAnalysis(Report &Message) {}
+
+  bool isUniform(Value *V) {
+    return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+  }
+
+  SmallVector<Instruction *, 8>
+  getInstructionsForAccess(AccessAnalysis::MemAccessInfo &A) {
+    SmallVector<Instruction *, 8> Insts;
+    auto &Ids = DepChecker.Accesses[A];
+    std::transform(Ids.begin(), Ids.end(), std::back_inserter(Insts),
+                   [&](unsigned Id) {
+                     return this->DepChecker.InstMap[Id];
+                   });
+    return Insts;
+  }
+
+  std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc) {
+    return RuntimeCheckEmitter::addRuntimeCheck(Loc, &PtrRtCheck, SE, TheLoop);
+  }
+
+private:
+  AliasAnalysis *AA;
+  ScalarEvolution *SE;
+  AccessAnalysis Accesses;
+  AccessAnalysis::DepCandidates DependentAccesses;
+public:
+  MemoryDepChecker DepChecker;
+private:
+  Loop *TheLoop;
+public:
+  RuntimePointerCheck PtrRtCheck;
+private:
+  // FIXME
+  ValueToValueMap Strides;
+public:
+  MemoryDepChecker::DependenceVector UnsafeDeps;
+};
+
+} // End llvm namespace
+
+#endif
Index: include/llvm/Analysis/ControlDependence.h
===================================================================
--- /dev/null
+++ include/llvm/Analysis/ControlDependence.h
@@ -0,0 +1,66 @@
+//===-  -  -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+
+#ifndef LLVM_ANALYSIS_CONTROLDEPENDENCE_H
+#define LLVM_ANALYSIS_CONTROLDEPENDENCE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include <algorithm>
+
+namespace llvm {
+  class ControlDependence {
+    // FIXME: do some caching here.
+    PostDominatorTree *PDT;
+    Loop *Region;
+
+  public:
+    ControlDependence(PostDominatorTree *PDT, Loop *L) : PDT(PDT), Region(L) {
+    }
+
+    Instruction *dependsOnBlock(BasicBlock *B) {
+      return nullptr;
+    }
+
+    static Instruction *findControlInstInBlock(BasicBlock *B) {
+      return nullptr;
+    }
+
+    SmallVector<BasicBlock *, 8> findControlBlocks(BasicBlock *B) {
+      SmallVector<BasicBlock *, 8> DirectCD;
+
+      // FIXME: find non-direct CDs as well.x
+      std::copy_if(pred_begin(B), pred_end(B),
+                   std::back_inserter(DirectCD),
+                   [this, B](BasicBlock *P) {
+                     return (this->Region->contains(P) &&
+                             !this->PDT->dominates(B, P));
+                   });
+
+      return DirectCD;
+    }
+
+    SmallVector<Instruction *, 8> findControlInsts(BasicBlock *B) {
+      SmallVector<Instruction *, 8> CI;
+
+      SmallVector<BasicBlock *, 8> CD = findControlBlocks(B);
+      for (BasicBlock *C : CD) {
+        auto *Br = cast<BranchInst>(C->getTerminator());
+        assert(Br->isConditional() &&
+               "Control block with unconditional branch");
+        CI.push_back(Br);
+      }
+      return CI;
+    }
+};
+}
+
+#endif
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -164,6 +164,7 @@
 void initializeLoaderPassPass(PassRegistry&);
 void initializeLocalStackSlotPassPass(PassRegistry&);
 void initializeLoopDeletionPass(PassRegistry&);
+void initializeLoopDistributePass(PassRegistry&);
 void initializeLoopExtractorPass(PassRegistry&);
 void initializeLoopInfoPass(PassRegistry&);
 void initializeLoopInstSimplifyPass(PassRegistry&);
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -138,6 +138,8 @@
 //
 Pass *createLoopUnswitchPass(bool OptimizeForSize = false);
 
+Pass *createLoopDistributePass();
+
 //===----------------------------------------------------------------------===//
 //
 // LoopInstSimplify - This pass simplifies instructions in a loop's body.
Index: lib/Analysis/AccessAnalysis.cpp
===================================================================
--- /dev/null
+++ lib/Analysis/AccessAnalysis.cpp
@@ -0,0 +1,399 @@
+#include "llvm/Analysis/AccessAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE  "access-analysis"
+
+using namespace llvm;
+
+std::pair<Instruction *, Instruction *>
+RuntimeCheckEmitter::addRuntimeCheck(Instruction *Loc,
+                                     RuntimePointerCheck *PtrRtCheck,
+                                     ScalarEvolution *SE,
+                                     Loop *OrigLoop) {
+  Instruction *tnullptr = nullptr;
+  if (!PtrRtCheck->Need)
+    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
+
+  unsigned NumPointers = PtrRtCheck->Pointers.size();
+  SmallVector<TrackingVH<Value> , 2> Starts;
+  SmallVector<TrackingVH<Value> , 2> Ends;
+
+  LLVMContext &Ctx = Loc->getContext();
+  SCEVExpander Exp(*SE, "induction");
+  Instruction *FirstInst = nullptr;
+
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    Value *Ptr = PtrRtCheck->Pointers[i];
+    const SCEV *Sc = SE->getSCEV(Ptr);
+
+    if (SE->isLoopInvariant(Sc, OrigLoop)) {
+      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
+            *Ptr <<"\n");
+      Starts.push_back(Ptr);
+      Ends.push_back(Ptr);
+    } else {
+      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
+      unsigned AS = Ptr->getType()->getPointerAddressSpace();
+
+      // Use this type for pointer arithmetic.
+      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
+
+      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
+      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
+      Starts.push_back(Start);
+      Ends.push_back(End);
+    }
+  }
+
+  unsigned NumChecks = 0;
+  IRBuilder<> ChkBuilder(Loc);
+  // Our instructions might fold to a constant.
+  Value *MemoryRuntimeCheck = nullptr;
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i+1; j < NumPointers; ++j) {
+      // No need to check if two readonly pointers intersect.
+      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
+        continue;
+
+      // Only need to check pointers between two different dependency sets.
+      if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
+       continue;
+      // Only need to check pointers in the same alias set.
+      if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j])
+        continue;
+      // FIXME: for LV make these all unique.
+      if (!PtrRtCheck->PartitionSetId.empty() &&
+          PtrRtCheck->PartitionSetId[i] == PtrRtCheck->PartitionSetId[j])
+        continue;
+
+      ++NumChecks;
+      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
+      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
+
+      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
+             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
+             "Trying to bounds check pointers with different address spaces");
+
+      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
+      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
+      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy1, "bc");
+      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy0, "bc");
+
+      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
+      FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
+      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
+      FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
+      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+      FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
+      if (MemoryRuntimeCheck) {
+        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
+                                         "conflict.rdx");
+        FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
+      }
+      MemoryRuntimeCheck = IsConflict;
+    }
+  }
+
+  dbgs() << "Number of checks: " << NumChecks << " for " << NumPointers
+         << " pointers\n";
+
+  // We have to do this trickery because the IRBuilder might fold the check to a
+  // constant expression in which case there is no Instruction anchored in a
+  // the block.
+  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
+                                                 ConstantInt::getTrue(Ctx));
+  ChkBuilder.Insert(Check, "memcheck.conflict");
+  FirstInst = getFirstInst(FirstInst, Check, Loc);
+  return std::make_pair(FirstInst, Check);
+}
+
+bool LoopAccessAnalysis::analyzeLoop() {
+
+  typedef SmallVector<Value*, 16> ValueVector;
+  typedef SmallPtrSet<Value*, 16> ValueSet;
+
+  // Holds the Load and Store *instructions*.
+  ValueVector Loads;
+  ValueVector Stores;
+
+  // Holds all the different accesses in the loop.
+  unsigned NumReads = 0;
+  unsigned NumReadWrites = 0;
+
+  PtrRtCheck.Pointers.clear();
+  PtrRtCheck.Need = false;
+
+  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
+  // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+
+    // Scan the BB and collect legal loads and stores.
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
+
+      // If this is a load, save it. If this instruction can read from memory
+      // but is not a load, then we quit. Notice that we don't handle function
+      // calls that read or write.
+      if (it->mayReadFromMemory()) {
+#ifdef FIXME
+        // Many math library functions read the rounding mode. We will only
+        // vectorize a loop if it contains known function calls that don't set
+        // the flag. Therefore, it is safe to ignore this read from memory.
+        CallInst *Call = dyn_cast<CallInst>(it);
+        if (Call && getIntrinsicIDForCall(Call, TLI))
+          continue;
+#endif
+        LoadInst *Ld = dyn_cast<LoadInst>(it);
+        if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+          emitAnalysis(Report(Ld)
+                       << "read with atomic ordering or volatile read");
+          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+          return false;
+        }
+        NumLoads++;
+        Loads.push_back(Ld);
+        DepChecker.addAccess(Ld);
+        continue;
+      }
+
+      // Save 'store' instructions. Abort if other instructions write to memory.
+      if (it->mayWriteToMemory()) {
+        StoreInst *St = dyn_cast<StoreInst>(it);
+        if (!St) {
+          emitAnalysis(Report(it) << "instruction cannot be vectorized");
+          return false;
+        }
+        if (!St->isSimple() && !IsAnnotatedParallel) {
+          emitAnalysis(Report(St)
+                       << "write with atomic ordering or volatile write");
+          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+          return false;
+        }
+        NumStores++;
+        Stores.push_back(St);
+        DepChecker.addAccess(St);
+      }
+    } // Next instr.
+  } // Next block.
+
+  // Now we have two lists that hold the loads and the stores.
+  // Next, we find the pointers that they use.
+
+  // Check if we see any stores. If there are no stores, then we don't
+  // care if the pointers are *restrict*.
+  if (!Stores.size()) {
+    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+    return true;
+  }
+
+  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
+  // multiple times on the same object. If the ptr is accessed twice, once
+  // for read and once for write, it will only appear once (on the write
+  // list). This is okay, since we are going to check for conflicts between
+  // writes and between reads and writes, but not between reads and reads.
+  ValueSet Seen;
+
+  ValueVector::iterator I, IE;
+  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
+    StoreInst *ST = cast<StoreInst>(*I);
+    Value* Ptr = ST->getPointerOperand();
+
+    if (isUniform(Ptr)) {
+      emitAnalysis(
+          Report(ST)
+          << "write to a loop invariant address could not be vectorized");
+      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+      return false;
+    }
+
+    // If we did *not* see this pointer before, insert it to  the read-write
+    // list. At this phase it is only a 'write' list.
+    if (Seen.insert(Ptr)) {
+      ++NumReadWrites;
+
+      AliasAnalysis::Location Loc = AA->getLocation(ST);
+#ifdef FIXME
+      // The TBAA metadata could have a control dependency on the predication
+      // condition, so we cannot rely on it when determining whether or not we
+      // need runtime pointer checks.
+      if (blockNeedsPredication(ST->getParent()))
+        Loc.AATags.TBAA = nullptr;
+#endif
+
+      Accesses.addStore(Loc);
+    }
+  }
+
+  if (IsAnnotatedParallel) {
+    DEBUG(dbgs()
+          << "LV: A loop annotated parallel, ignore memory dependency "
+          << "checks.\n");
+    return true;
+  }
+
+  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
+    LoadInst *LD = cast<LoadInst>(*I);
+    Value* Ptr = LD->getPointerOperand();
+    // If we did *not* see this pointer before, insert it to the
+    // read list. If we *did* see it before, then it is already in
+    // the read-write list. This allows us to vectorize expressions
+    // such as A[i] += x;  Because the address of A[i] is a read-write
+    // pointer. This only works if the index of A[i] is consecutive.
+    // If the address of i is unknown (for example A[B[i]]) then we may
+    // read a few words, modify, and write a few words, and some of the
+    // words may be written to the same address.
+    bool IsReadOnlyPtr = false;
+    if (Seen.insert(Ptr)
+#ifdef FIXME
+        || !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)
+#endif
+      ) {
+      ++NumReads;
+      IsReadOnlyPtr = true;
+    }
+
+    AliasAnalysis::Location Loc = AA->getLocation(LD);
+#ifdef FIXME
+    // The TBAA metadata could have a control dependency on the predication
+    // condition, so we cannot rely on it when determining whether or not we
+    // need runtime pointer checks.
+    if (blockNeedsPredication(LD->getParent()))
+      Loc.AATags.TBAA = nullptr;
+#endif
+
+    Accesses.addLoad(Loc, IsReadOnlyPtr);
+  }
+
+  // If we write (or read-write) to a single destination and there are no
+  // other reads in this loop then is it safe to vectorize.
+  if (NumReadWrites == 1 && NumReads == 0) {
+    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
+    return true;
+  }
+
+  // Build dependence sets and check whether we need a runtime pointer bounds
+  // check.
+  Accesses.buildDependenceSets();
+  bool NeedRTCheck = Accesses.isRTCheckNeeded();
+
+  // Find pointers with computable bounds. We are going to use this information
+  // to place a runtime bound check.
+  unsigned NumComparisons = 0;
+  bool CanDoRT = false;
+  if (NeedRTCheck)
+    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop,
+                                       Strides);
+
+  DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
+        " pointer comparisons.\n");
+
+  // If we only have one set of dependences to check pointers among we don't
+  // need a runtime check.
+  if (NumComparisons == 0 && NeedRTCheck)
+    NeedRTCheck = false;
+
+#ifdef FIXME
+  // Check that we did not collect too many pointers or found an unsizeable
+  // pointer.
+  if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+    PtrRtCheck.reset();
+    CanDoRT = false;
+  }
+#endif
+  if (CanDoRT) {
+    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+  }
+
+  if (NeedRTCheck && !CanDoRT) {
+    emitAnalysis(Report() << "cannot identify array bounds");
+    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
+          "the array bounds.\n");
+    PtrRtCheck.reset();
+    return false;
+  }
+
+  PtrRtCheck.Need = NeedRTCheck;
+
+  if (Accesses.isDependencyCheckNeeded()) {
+    DEBUG(dbgs() << "LV: Checking memory dependencies\n");
+    DepChecker.findUnsafeDeps(DependentAccesses,
+                              Accesses.getDependenciesToCheck(),
+                              Strides, UnsafeDeps);
+  }
+
+  DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
+        " need a runtime memory check.\n");
+
+  return true;
+}
+
+void MemoryDepChecker::findUnsafeDeps(AccessAnalysis::DepCandidates &AccessSets,
+                                      MemAccessInfoSet &CheckDeps,
+                                      ValueToValueMap &Strides,
+                                      DependenceVector &UnsafeDeps) {
+  MaxSafeDepDistBytes = -1U;
+  while (!CheckDeps.empty()) {
+    MemAccessInfo CurAccess = *CheckDeps.begin();
+
+    // Get the relevant memory access set.
+    EquivalenceClasses<MemAccessInfo>::iterator I =
+      AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
+
+    // Check accesses within this set.
+    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
+    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
+
+    // Check every access pair.
+    while (AI != AE) {
+      CheckDeps.erase(*AI);
+      EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
+      while (OI != AE) {
+        // Check every accessing instruction pair in program order.
+        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
+             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
+          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
+               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
+            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides))
+              UnsafeDeps.push_back(
+                std::make_pair(OrderedInstruction(InstMap[*I1], *I1),
+                               OrderedInstruction(InstMap[*I2], *I2)));
+            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides))
+              UnsafeDeps.push_back(
+                std::make_pair(OrderedInstruction(InstMap[*I2], *I2),
+                               OrderedInstruction(InstMap[*I1], *I1)));
+          }
+        ++OI;
+      }
+      AI++;
+    }
+  }
+}
+
+bool
+MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+                              MemAccessInfoSet &CheckDeps,
+                              ValueToValueMap &Strides) {
+  DependenceVector UnsafeDeps;
+  findUnsafeDeps(AccessSets, CheckDeps, Strides, UnsafeDeps);
+  return UnsafeDeps.size() == 0;
+}
+
+void RuntimePointerCheck::print() {
+    for (unsigned I = 0; I < Pointers.size(); ++I)
+      dbgs() << *Pointers[I]
+             << " Partition: " << PartitionSetId[I]
+             << " DependencySet: " << DependencySetId[I]
+             << " Write: " << IsWritePtr[I]
+             << "\n";
+}
+namespace llvm {
+bool operator<(const OrderedInstruction &LHS, const OrderedInstruction &RHS) {
+  return LHS.Order < RHS.Order;
+}
+}
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -298,6 +298,10 @@
   if (ExtraVectorizerPasses)
     MPM.add(createLoopRotatePass());
 
+  // FIXME: ensure we have loop preheaders in ldist
+  MPM.add(createLoopSimplifyPass());
+  MPM.add(createLoopDistributePass());
+
   MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
   // FIXME: Because of #pragma vectorize enable, the passes below are always
   // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
Index: lib/Transforms/Scalar/LoopDistribute.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Scalar/LoopDistribute.cpp
@@ -0,0 +1,532 @@
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/iterator_range.h"
+#include <set>
+#include <numeric>
+#include "llvm/Analysis/AccessAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ControlDependence.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Pass.h"
+
+#define LDIST_NAME "loop-distribute"
+#define DEBUG_TYPE LDIST_NAME
+
+using namespace llvm;
+
+namespace {
+
+void remapInstructions(SmallVectorImpl<BasicBlock *> &Blocks,
+                       ValueToValueMapTy &VMap) {
+    // Rewrite the code to refer to itself.
+    for (unsigned i = 0, e = Blocks.size(); i != e; ++i)
+      for (BasicBlock::iterator I = Blocks[i]->begin(),
+             E = Blocks[i]->end(); I != E; ++I)
+        RemapInstruction(I, VMap,
+                         RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
+}
+
+class InstrPartition {
+  typedef std::set<Instruction *> InstructionSet;
+
+  InstructionSet Set;
+  Loop *L;
+  ControlDependence *CD;
+
+public:
+  typedef SmallVector<BasicBlock *, 8> BlocksT;
+  BlocksT Blocks;
+  ValueToValueMapTy VMap;
+  bool hasCycle;
+
+  template <class IterTy>
+  InstrPartition(IterTy B, IterTy E, bool hasCycle, Loop *L,
+                 ControlDependence *CD) : L(L), CD(CD), hasCycle(hasCycle) {
+    for (IterTy I = B; I != E; ++I) {
+      Set.insert((*I).I);
+    }
+  }
+
+  InstructionSet::iterator begin() {
+    return Set.begin();
+  }
+
+  InstructionSet::iterator end() {
+    return Set.end();
+  }
+
+  void populateUsedSet() {
+    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
+
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.pop_back_val();
+      BasicBlock *BB = I->getParent();
+
+      SmallVector<Instruction *, 8> CIs = CD->findControlInsts(BB);
+      // Insert in the set and any new elements onto the worklist.
+      for (Instruction *I : CIs)
+        if (Set.insert(I).second)
+          Worklist.push_back(I);
+
+      // Insert instructions from the loop that we depend on as well.
+      for (Value *V : I->operand_values()) {
+        auto *I = dyn_cast<Instruction>(V);
+        if (I && L->contains(I->getParent()) && Set.insert(I).second)
+          Worklist.push_back(I);
+      }
+    }
+  }
+
+  void print() {
+    for (auto *I : Set)
+      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
+    if (hasCycle)
+      dbgs() << "  (cycle)\n";
+  }
+
+  BasicBlock *getLoopPreheader() {
+    return Blocks[0];
+  }
+
+  void remapInstructions() {
+    ::remapInstructions(Blocks, VMap);
+  }
+
+  iterator_range<BlocksT::iterator> getLoopBlocks() {
+    return make_range(Blocks.begin() + 1, Blocks.end() - 1);
+  }
+
+  void removeUnusedInsts() {
+    SmallVector<Instruction *, 8> Unused;
+    for (auto *BB: L->getBlocks())
+      for (auto &I: *BB)
+        if (!Set.count(&I)) {
+          Instruction *NewI = &I;
+          if (!VMap.empty())
+            NewI = cast<Instruction>(VMap[NewI]);
+          auto *Br = dyn_cast<BranchInst>(NewI);
+          if (Br) {
+            // If this is a conditional branch that no-one is control dependent
+            // on then we can go either way.  Turn it into an unconditional
+            // branch.  Keep unconditional branches.
+            if (Br->isConditional()) {
+              BranchInst::Create(Br->getSuccessor(0), Br);
+              Unused.push_back(NewI);
+            }
+          } else
+            Unused.push_back(NewI);
+        }
+    for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I)
+      (*I)->eraseFromParent();
+
+    dbgs() << "\nAfter removing unused Instrs:\n";
+    if (Blocks.empty())
+      for (auto *BB: L->getBlocks())
+        dbgs() << *BB;
+    else
+      for (auto *BB: Blocks)
+        dbgs() << *BB;
+  }
+};
+
+// When splitting A->B, SplitEdge produces A->B->B.split and returns the new
+// block B.split, I think that in some other cases it returns the middle block.
+// To keep this consistent when transforming A->B to A->M->B we return M and
+// update B if necessary.
+BasicBlock *LDistSplitEdge(BasicBlock *From, BasicBlock *&To, Pass *P) {
+  BasicBlock *NewBlock = SplitEdge(From, To, P);
+  BasicBlock *SinglePred = NewBlock->getSinglePredecessor();
+  if (SinglePred && SinglePred != From) {
+    To = NewBlock;
+    return SinglePred;
+  }
+  return NewBlock;
+}
+
+SmallVector<BasicBlock *, 8> splitLoopPreheadAndExit(Loop *L, Pass *P) {
+    BasicBlock *PH = L->getLoopPreheader();
+    BasicBlock *H = L->getHeader();
+    BasicBlock *NewPH = LDistSplitEdge(PH, H, P);
+    BasicBlock *ExitBlock = L->getExitBlock();
+    BasicBlock *NewExit = LDistSplitEdge(L->getExitingBlock(), ExitBlock, P);
+    SmallVector<BasicBlock *, 8> LoopBlocks;
+    LoopBlocks.push_back(NewPH);
+    LoopBlocks.append(L->block_begin(), L->block_end());
+    LoopBlocks.push_back(NewExit);
+    return LoopBlocks;
+}
+
+void cloneLoop(BasicBlock *Before,
+               SmallVectorImpl<BasicBlock *> &LoopBlocks,
+               SmallVectorImpl<BasicBlock *> &NewBlocks,
+               ValueToValueMapTy &VMap, const Twine &NameSuffix) {
+  Function *F = LoopBlocks[0]->getParent();
+  for (BasicBlock *BB : LoopBlocks) {
+    BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+    NewBlocks.push_back(NewBB);
+    VMap[BB] = NewBB;
+// FIXME:
+// LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L);
+  }
+  // Move them physically from the end of the block list.
+  F->getBasicBlockList().splice(Before, F->getBasicBlockList(),
+                                NewBlocks[0], F->end());
+}
+
+//
+//
+//
+class InstrPartitionSet {
+
+  SmallVector<std::unique_ptr<InstrPartition>, 8> Vector;
+  EquivalenceClasses<OrderedInstruction> EC;
+  Loop *L;
+  PostDominatorTree *PDT;
+  ControlDependence CD;
+  DenseMap<Instruction *, unsigned> InstToPartitionId;
+  SmallSet<const OrderedInstruction *, 8> ClassWithCycle;
+
+public:
+  InstrPartitionSet(Loop *L, PostDominatorTree *PDT) :
+    L(L), PDT(PDT), CD(PDT, L) {
+  }
+
+  size_t size() {
+    return EC.getNumClasses();
+  }
+
+  void seedTogether(OrderedInstruction A, OrderedInstruction B) {
+    auto I = EC.unionSets(A, B);
+    ClassWithCycle.insert(&*I);
+  }
+
+  void mergeClasses(const OrderedInstruction &A, const OrderedInstruction &B) {
+    auto I = EC.unionSets(A, B);
+    assert(&A == &*I);
+    if (ClassWithCycle.count(&B))
+      ClassWithCycle.insert(&*I);
+  }
+
+  void seed(OrderedInstruction I) {
+    EC.insert(I);
+  }
+
+  bool isConditional(Instruction *I) {
+    return !PDT->dominates(I->getParent(), L->getHeader());
+  }
+
+  void mergeSingleStores() {
+    // FIXME: merge single stores.  We should be able to vectorize these in one
+    // loop.  (Multiple instructions in a class means that there is unsafe
+    // dependence among them, so we won't be able to vectorize that loop.)
+    for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
+      auto &P = *I;
+      if (P.isLeader() && P.getNext() == nullptr) {
+        auto Range = make_range(std::next(I), EC.end());
+        for (auto &R: Range)
+          if (R.isLeader() && R.getNext() == nullptr &&
+              R.getData().I->getParent() == P.getData().I->getParent())
+            // FIXME: Doesn't this invalidate the iterator?
+            mergeClasses(P.getData(), R.getData());
+      }
+    }
+  }
+
+  void mergeConditionalStores() {
+    // If all stores are conditional in a partition, we won't be able to
+    // vectorize it so merge it into previous partition.
+    const OrderedInstruction *PrevClass = nullptr;
+    for (auto I = EC.begin(), IE = EC.end(); I != IE; ++I)
+      if (I->isLeader()) {
+        if (PrevClass) {
+          auto J = EC.member_begin(I), JE = EC.member_end();
+          bool seenStore = false;
+          for (; J != JE; ++J) {
+            if (isa<StoreInst>((*J).I))
+              seenStore = true;
+            if (seenStore && !isConditional((*J).I))
+              break;
+          }
+          if (J == JE)
+            mergeClasses(I->getData(), *PrevClass);
+        }
+        PrevClass = &I->getData();
+      }
+  }
+
+  void merge() {
+    mergeSingleStores();
+    dbgs() << "\nMerged partitions (1):\n";
+    printEC();
+
+    mergeConditionalStores();
+    dbgs() << "\nMerged partitions(2):\n";
+    printEC();
+  }
+
+  void createPartitions() {
+    for (auto I = EC.begin(), E = EC.end(); I != E; ++I)
+      if (I->isLeader())
+        Vector.push_back(make_unique<InstrPartition>(EC.member_begin(I),
+                                                     EC.member_end(),
+                                                     ClassWithCycle.count(&I->getData()),
+                                                     L, &CD));
+  }
+
+  void populateUsedSet() {
+    for (auto &P : Vector)
+      P->populateUsedSet();
+
+    for (unsigned P = 0; P < Vector.size(); ++P)
+      for (Instruction *I : *Vector[P])
+        InstToPartitionId[I] = P;
+  }
+
+  void setSuccessor(BasicBlock *BB, BasicBlock *Succ) {
+    auto Branch = cast<BranchInst>(BB->getTerminator());
+    assert(Branch->isUnconditional());
+    Branch->setSuccessor(0, Succ);
+  }
+
+  void cloneLoops(Pass *P) {
+    BasicBlock *ExitBlock = L->getExitBlock();
+    BasicBlock *PH = L->getLoopPreheader();
+    SmallVector<BasicBlock *, 8> LoopBlocks = splitLoopPreheadAndExit(L, P);
+    BasicBlock *NewPH = LoopBlocks[0];
+
+    // In order to stich in the new loop on the PH -> NewPH edge:
+    //   * change the NewExit -> ExitBlock to NewExit -> NewPH
+    //   * change PH -> NewPH to PH -> last cloned NewPH
+    for (int I = Vector.size() - 2; I >= 0; --I) {
+      auto &Part = Vector[I];
+
+      cloneLoop(NewPH, LoopBlocks, Part->Blocks, Part->VMap,
+                Twine(".ldist") + Twine(I + 1));
+      Part->VMap[ExitBlock] = NewPH;
+      NewPH = Part->getLoopPreheader();
+      Part->remapInstructions();
+    }
+    setSuccessor(PH, NewPH);
+  }
+
+  void removeUnusedInsts() {
+    for (unsigned I = 0; I < Vector.size(); ++I) {
+      auto &Part = Vector[I];
+          Part->removeUnusedInsts();
+    }
+  }
+
+  void setPartitionPtrUse(LoopAccessAnalysis &LAA) {
+    RuntimePointerCheck &RtPtrCheck = LAA.PtrRtCheck;
+    unsigned N = RtPtrCheck.Pointers.size();
+    assert(RtPtrCheck.PartitionSetId.empty() && "PartitionSetId not empty");
+    RtPtrCheck.PartitionSetId.reserve(N);
+    for (unsigned I = 0; I < N; ++I) {
+      Value *Ptr = RtPtrCheck.Pointers[I];
+      AccessAnalysis::MemAccessInfo Access(Ptr, RtPtrCheck.IsWritePtr[I]);
+      auto Instructions = LAA.getInstructionsForAccess(Access);
+      uint64_t P =
+        std::accumulate(Instructions.begin(), Instructions.end(),
+                        (uint64_t) 0,
+                        [&](uint64_t res, Instruction *Inst) {
+                          return res | (1 << this->InstToPartitionId[Inst]);
+                        });
+      RtPtrCheck.PartitionSetId.push_back(P);
+    }
+  }
+
+  void printEC() {
+    int i = 0;
+    for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
+      if (!I->isLeader()) continue;
+      dbgs() << i++;
+      if (ClassWithCycle.count(&I->getData()))
+        dbgs() << " (cycle)";
+      dbgs() << ":\n";
+      for (auto MI = EC.member_begin(I); MI != EC.member_end(); ++MI)
+        dbgs() << "  " << (*MI).Order << " " << *(*MI).I << "\n";
+    }
+  }
+
+  void print() {
+    for (auto I = Vector.begin(); I != Vector.end(); ++I) {
+      dbgs() << I - Vector.begin() << ":\n";
+      (*I)->print();
+    }
+  }
+};
+
+class LDistRuntimeCheckEmitter {
+  LoopAccessAnalysis &LAA;
+  Loop *L;
+  BasicBlock *MemCheckBB;
+  SmallVector<BasicBlock *, 8> OrigLoopBlocks;
+
+public:
+  LDistRuntimeCheckEmitter(LoopAccessAnalysis &LAA, Loop *L) :
+    LAA(LAA), L(L), MemCheckBB(nullptr) {
+  }
+
+  bool needsRuntimeChecks() const {
+    // FIXME
+    return true;
+  }
+
+  void partitionPointers(InstrPartitionSet &Partitions) {
+    // Set up partition id in PtrRtChecks.  Ptr -> Access -> Intruction ->
+    // Partition.
+    Partitions.setPartitionPtrUse(LAA);
+    dbgs() << "\nPointers:\n";
+    LAA.PtrRtCheck.print();
+  }
+
+  void versionLoop(Pass *P) {
+    BasicBlock *OrigPH = L->getLoopPreheader();
+    SmallVector<BasicBlock *, 8> LoopBlocks = splitLoopPreheadAndExit(L, P);
+    ValueToValueMapTy VMap;
+    cloneLoop(*(LoopBlocks.end() - 1), LoopBlocks, OrigLoopBlocks, VMap,
+              ".ldist.orig");
+    remapInstructions(OrigLoopBlocks, VMap);
+
+    MemCheckBB = SplitBlock(OrigPH, OrigPH->getTerminator(), P);
+    MemCheckBB->setName(L->getHeader()->getName() + ".ldist.memcheck");
+
+  }
+
+  void insertChecks() {
+    Instruction *FirstCheckInst;
+    Instruction *MemRuntimeCheck;
+    Instruction *OrigTerm = MemCheckBB->getTerminator();
+    std::tie(FirstCheckInst, MemRuntimeCheck) = LAA.addRuntimeCheck(OrigTerm);
+    BranchInst::Create(OrigLoopBlocks[0], L->getLoopPreheader(),
+                       MemRuntimeCheck, OrigTerm);
+    OrigTerm->eraseFromParent();
+  }
+};
+
+struct LoopDistribute : public FunctionPass {
+  static char ID;
+  typedef EquivalenceClasses<Instruction *> PartitionsT;
+
+  LoopDistribute() : FunctionPass(ID) {
+    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
+  }
+
+  ScalarEvolution *SE;
+  const DataLayout *DL;
+  LoopInfo *LI;
+  AliasAnalysis *AA;
+  PostDominatorTree *PDT;
+
+  virtual bool runOnFunction(Function &F) {
+    SE = &getAnalysis<ScalarEvolution>();
+    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+    DL = DLP ? &DLP->getDataLayout() : nullptr;
+    LI = &getAnalysis<LoopInfo>();
+    AA = &getAnalysis<AliasAnalysis>();
+    PDT = &getAnalysis<PostDominatorTree>();
+
+    // Build up a worklist of inner-loops to vectorize. This is necessary as
+    // the act of vectorizing or partially unrolling a loop creates new loops
+    // and can invalidate iterators across the loops.
+    SmallVector<Loop *, 8> Worklist;
+
+    for (Loop *L : *LI)
+      addInnerLoop(*L, Worklist);
+
+    // Now walk the identified inner loops.
+    bool Changed = false;
+    while (!Worklist.empty())
+      Changed |= processLoop(Worklist.pop_back_val());
+
+    // Process each loop nest in the function.
+    return Changed;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfo>();
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<PostDominatorTree>();
+  }
+
+  bool processLoop(Loop *L) {
+    assert(L->empty() && "Only process inner loops.");
+
+    DEBUG(dbgs() << "\nLV: In \""
+          << L->getHeader()->getParent()->getName() <<
+          "\" checking " << *L << "\n");
+
+    // ScalarEvolution needs to be able to find the exit count.
+    const SCEV *ExitCount = SE->getBackedgeTakenCount(L);
+    if (ExitCount == SE->getCouldNotCompute()) {
+      DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
+      return false;
+    }
+
+    LoopAccessAnalysis LAA(L, DL, AA, SE);
+    if (!LAA.analyzeLoop())
+      return false;
+
+    InstrPartitionSet Partitions(L, PDT);
+
+    // FIXME: This is not sufficient.  We need to put all mem accesses between
+    // these two into the same partition as well.
+    for (auto &d : LAA.UnsafeDeps) {
+      dbgs() << *d.first.I
+             << "\n->\n" << *d.second.I << "\n\n";
+
+      Partitions.seedTogether(d.first, d.second);
+    }
+
+    // FIXME: also add defs whose value is used outside the loop
+    for (auto B = LAA.DepChecker.InstMap.begin(),
+           E = LAA.DepChecker.InstMap.end(), I = B; I != E; ++I)
+      if (isa<StoreInst>(*I))
+        Partitions.seed(OrderedInstruction(*I, std::distance(B, I)));
+
+    dbgs() << "Seeded partitions:\n";
+    Partitions.printEC();
+
+    Partitions.merge();
+
+    if (Partitions.size() <= 1)
+      return false;
+
+    Partitions.createPartitions();
+    Partitions.populateUsedSet();
+    dbgs() << "\nPopulated partitions:\n";
+    Partitions.print();
+
+    LDistRuntimeCheckEmitter RtCheckEmitter(LAA, L);
+    if (RtCheckEmitter.needsRuntimeChecks()) {
+      RtCheckEmitter.partitionPointers(Partitions);
+      RtCheckEmitter.versionLoop(this);
+      RtCheckEmitter.insertChecks();
+    }
+
+    Partitions.cloneLoops(this);
+
+    Partitions.removeUnusedInsts();
+
+    return true;
+  }
+};
+}
+
+char LoopDistribute::ID = 0;
+static const char ldist_name[] = "Loop Distribition";
+
+INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
+INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+
+namespace llvm {
+  Pass *createLoopDistributePass() {
+    return new LoopDistribute();
+  }
+}
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -44,6 +44,7 @@
   initializeJumpThreadingPass(Registry);
   initializeLICMPass(Registry);
   initializeLoopDeletionPass(Registry);
+  initializeLoopDistributePass(Registry);
   initializeLoopInstSimplifyPass(Registry);
   initializeLoopRotatePass(Registry);
   initializeLoopStrengthReducePass(Registry);
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -66,6 +66,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/AccessAnalysis.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
@@ -218,29 +219,6 @@
 class LoopVectorizationCostModel;
 class LoopVectorizeHints;
 
-/// Optimization analysis message produced during vectorization. Messages inform
-/// the user why vectorization did not occur.
-class Report {
-  std::string Message;
-  raw_string_ostream Out;
-  Instruction *Instr;
-
-public:
-  Report(Instruction *I = nullptr) : Out(Message), Instr(I) {
-    Out << "loop not vectorized: ";
-  }
-
-  template <typename A> Report &operator<<(const A &Value) {
-    Out << Value;
-    return *this;
-  }
-
-  Instruction *getInstr() { return Instr; }
-
-  std::string &str() { return Out.str(); }
-  operator Twine() { return Out.str(); }
-};
-
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -255,7 +233,7 @@
 /// aspects. The InnerLoopVectorizer relies on the
 /// LoopVectorizationLegality class to provide information about the induction
 /// and reduction variables that were found to a given vectorization factor.
-class InnerLoopVectorizer {
+class InnerLoopVectorizer : public RuntimeCheckEmitter {
 public:
   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                       DominatorTree *DT, const DataLayout *DL,
@@ -293,13 +271,6 @@
   typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
                    VectorParts> EdgeMaskCache;
 
-  /// \brief Add code that checks at runtime if the accessed arrays overlap.
-  ///
-  /// Returns a pair of instructions where the first element is the first
-  /// instruction generated in possibly a sequence of instructions and the
-  /// second value is the final comparator value or NULL if no check is needed.
-  std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc);
-
   /// \brief Add checks for strides that where assumed to be 1.
   ///
   /// Returns the last check instruction and the first check instruction in the
@@ -657,43 +628,6 @@
     MinMaxReductionKind MinMaxKind;
   };
 
-  /// This struct holds information about the memory runtime legality
-  /// check that a group of pointers do not overlap.
-  struct RuntimePointerCheck {
-    RuntimePointerCheck() : Need(false) {}
-
-    /// Reset the state of the pointer runtime information.
-    void reset() {
-      Need = false;
-      Pointers.clear();
-      Starts.clear();
-      Ends.clear();
-      IsWritePtr.clear();
-      DependencySetId.clear();
-      AliasSetId.clear();
-    }
-
-    /// Insert a pointer and calculate the start and end SCEVs.
-    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
-                unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides);
-
-    /// This flag indicates if we need to add the runtime check.
-    bool Need;
-    /// Holds the pointers that we need to check.
-    SmallVector<TrackingVH<Value>, 2> Pointers;
-    /// Holds the pointer value at the beginning of the loop.
-    SmallVector<const SCEV*, 2> Starts;
-    /// Holds the pointer value at the end of the loop.
-    SmallVector<const SCEV*, 2> Ends;
-    /// Holds the information if this pointer is used for writing to memory.
-    SmallVector<bool, 2> IsWritePtr;
-    /// Holds the id of the set of pointers that could be dependent because of a
-    /// shared underlying object.
-    SmallVector<unsigned, 2> DependencySetId;
-    /// Holds the id of the disjoint alias set to which this pointer belongs.
-    SmallVector<unsigned, 2> AliasSetId;
-  };
-
   /// A struct for saving information about induction variables.
   struct InductionInfo {
     InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
@@ -1220,14 +1154,6 @@
   }
 }
 
-static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
-  if (L.empty())
-    return V.push_back(&L);
-
-  for (Loop *InnerL : L)
-    addInnerLoop(*InnerL, V);
-}
-
 /// The LoopVectorize Pass.
 struct LoopVectorize : public FunctionPass {
   /// Pass identification, replacement for typeid
@@ -1536,7 +1462,7 @@
   return SE->getSCEV(Ptr);
 }
 
-void LoopVectorizationLegality::RuntimePointerCheck::insert(
+void RuntimePointerCheck::insert(
     ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
     unsigned ASId, ValueToValueMap &Strides) {
   // Get the stride replaced scev.
@@ -1998,15 +1924,6 @@
   }
 }
 
-static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
-                                 Instruction *Loc) {
-  if (FirstInst)
-    return FirstInst;
-  if (Instruction *I = dyn_cast<Instruction>(V))
-    return I->getParent() == Loc->getParent() ? I : nullptr;
-  return nullptr;
-}
-
 std::pair<Instruction *, Instruction *>
 InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
   Instruction *tnullptr = nullptr;
@@ -2044,102 +1961,6 @@
   return std::make_pair(FirstInst, TheCheck);
 }
 
-std::pair<Instruction *, Instruction *>
-InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
-  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
-  Legal->getRuntimePointerCheck();
-
-  Instruction *tnullptr = nullptr;
-  if (!PtrRtCheck->Need)
-    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
-
-  unsigned NumPointers = PtrRtCheck->Pointers.size();
-  SmallVector<TrackingVH<Value> , 2> Starts;
-  SmallVector<TrackingVH<Value> , 2> Ends;
-
-  LLVMContext &Ctx = Loc->getContext();
-  SCEVExpander Exp(*SE, "induction");
-  Instruction *FirstInst = nullptr;
-
-  for (unsigned i = 0; i < NumPointers; ++i) {
-    Value *Ptr = PtrRtCheck->Pointers[i];
-    const SCEV *Sc = SE->getSCEV(Ptr);
-
-    if (SE->isLoopInvariant(Sc, OrigLoop)) {
-      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
-            *Ptr <<"\n");
-      Starts.push_back(Ptr);
-      Ends.push_back(Ptr);
-    } else {
-      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
-      unsigned AS = Ptr->getType()->getPointerAddressSpace();
-
-      // Use this type for pointer arithmetic.
-      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
-
-      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
-      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
-      Starts.push_back(Start);
-      Ends.push_back(End);
-    }
-  }
-
-  IRBuilder<> ChkBuilder(Loc);
-  // Our instructions might fold to a constant.
-  Value *MemoryRuntimeCheck = nullptr;
-  for (unsigned i = 0; i < NumPointers; ++i) {
-    for (unsigned j = i+1; j < NumPointers; ++j) {
-      // No need to check if two readonly pointers intersect.
-      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
-        continue;
-
-      // Only need to check pointers between two different dependency sets.
-      if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
-       continue;
-      // Only need to check pointers in the same alias set.
-      if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j])
-        continue;
-
-      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
-      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
-
-      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
-             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
-             "Trying to bounds check pointers with different address spaces");
-
-      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
-      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
-
-      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
-      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
-      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy1, "bc");
-      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy0, "bc");
-
-      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
-      FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
-      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
-      FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
-      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
-      FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
-      if (MemoryRuntimeCheck) {
-        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
-                                         "conflict.rdx");
-        FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
-      }
-      MemoryRuntimeCheck = IsConflict;
-    }
-  }
-
-  // We have to do this trickery because the IRBuilder might fold the check to a
-  // constant expression in which case there is no Instruction anchored in a
-  // the block.
-  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
-                                                 ConstantInt::getTrue(Ctx));
-  ChkBuilder.Insert(Check, "memcheck.conflict");
-  FirstInst = getFirstInst(FirstInst, Check, Loc);
-  return std::make_pair(FirstInst, Check);
-}
-
 void InnerLoopVectorizer::createEmptyLoop() {
   /*
    In this function we generate a new loop. The new loop will contain
@@ -2364,7 +2185,8 @@
   // faster.
   Instruction *MemRuntimeCheck;
   std::tie(FirstCheckInst, MemRuntimeCheck) =
-      addRuntimeCheck(LastBypassBlock->getTerminator());
+      addRuntimeCheck(LastBypassBlock->getTerminator(),
+                      Legal->getRuntimePointerCheck(), SE, OrigLoop);
   if (MemRuntimeCheck) {
     // Create a new block containing the memory check.
     BasicBlock *CheckBlock =
@@ -3989,91 +3811,6 @@
   }
 }
 
-namespace {
-/// \brief Analyses memory accesses in a loop.
-///
-/// Checks whether run time pointer checks are needed and builds sets for data
-/// dependence checking.
-class AccessAnalysis {
-public:
-  /// \brief Read or write access location.
-  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
-  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
-
-  /// \brief Set of potential dependent memory accesses.
-  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
-
-  AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
-    DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
-
-  /// \brief Register a load  and whether it is only read from.
-  void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
-    Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
-    Accesses.insert(MemAccessInfo(Ptr, false));
-    if (IsReadOnly)
-      ReadOnlyPtr.insert(Ptr);
-  }
-
-  /// \brief Register a store.
-  void addStore(AliasAnalysis::Location &Loc) {
-    Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
-    Accesses.insert(MemAccessInfo(Ptr, true));
-  }
-
-  /// \brief Check whether we can check the pointers at runtime for
-  /// non-intersection.
-  bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
-                       unsigned &NumComparisons, ScalarEvolution *SE,
-                       Loop *TheLoop, ValueToValueMap &Strides,
-                       bool ShouldCheckStride = false);
-
-  /// \brief Goes over all memory accesses, checks whether a RT check is needed
-  /// and builds sets of dependent accesses.
-  void buildDependenceSets() {
-    processMemAccesses();
-  }
-
-  bool isRTCheckNeeded() { return IsRTCheckNeeded; }
-
-  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
-  void resetDepChecks() { CheckDeps.clear(); }
-
-  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
-
-private:
-  typedef SetVector<MemAccessInfo> PtrAccessSet;
-
-  /// \brief Go over all memory access and check whether runtime pointer checks
-  /// are needed /// and build sets of dependency check candidates.
-  void processMemAccesses();
-
-  /// Set of all accesses.
-  PtrAccessSet Accesses;
-
-  /// Set of accesses that need a further dependence check.
-  MemAccessInfoSet CheckDeps;
-
-  /// Set of pointers that are read only.
-  SmallPtrSet<Value*, 16> ReadOnlyPtr;
-
-  const DataLayout *DL;
-
-  /// An alias set tracker to partition the access set by underlying object and
-  //intrinsic property (such as TBAA metadata).
-  AliasSetTracker AST;
-
-  /// Sets of potentially dependent accesses - members of one set share an
-  /// underlying pointer. The set "CheckDeps" identfies which sets really need a
-  /// dependence check.
-  DepCandidates &DepCands;
-
-  bool IsRTCheckNeeded;
-};
-
-} // end anonymous namespace
-
 /// \brief Check whether a pointer can participate in a runtime bounds check.
 static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
                                 Value *Ptr) {
@@ -4091,7 +3828,7 @@
                         const Loop *Lp, ValueToValueMap &StridesMap);
 
 bool AccessAnalysis::canCheckPtrAtRT(
-    LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
+    RuntimePointerCheck &RtCheck,
     unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop,
     ValueToValueMap &StridesMap, bool ShouldCheckStride) {
   // Find pointers with computable bounds. We are going to use this information
@@ -4261,6 +3998,7 @@
         // there is no other write to the ptr - this is an optimization to
         // catch "a[i] = a[i] + " without having to do a dependence check).
         if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
+          DEBUG(dbgs() << "LV: CheckDeps this ptr " << *Access.getPointer() << "\n");
           CheckDeps.insert(Access);
           IsRTCheckNeeded = true;
         }
@@ -4279,6 +4017,7 @@
           if (Prev != ObjToLastAccess.end())
             DepCands.unionSets(Access, Prev->second);
 
+          DEBUG(dbgs() << "LV: UnderlyingPtr " << *UnderlyingObj << "\n");
           ObjToLastAccess[UnderlyingObj] = Access;
         }
       }
@@ -4286,124 +4025,6 @@
   }
 }
 
-namespace {
-/// \brief Checks memory dependences among accesses to the same underlying
-/// object to determine whether there vectorization is legal or not (and at
-/// which vectorization factor).
-///
-/// This class works under the assumption that we already checked that memory
-/// locations with different underlying pointers are "must-not alias".
-/// We use the ScalarEvolution framework to symbolically evalutate access
-/// functions pairs. Since we currently don't restructure the loop we can rely
-/// on the program order of memory accesses to determine their safety.
-/// At the moment we will only deem accesses as safe for:
-///  * A negative constant distance assuming program order.
-///
-///      Safe: tmp = a[i + 1];     OR     a[i + 1] = x;
-///            a[i] = tmp;                y = a[i];
-///
-///   The latter case is safe because later checks guarantuee that there can't
-///   be a cycle through a phi node (that is, we check that "x" and "y" is not
-///   the same variable: a header phi can only be an induction or a reduction, a
-///   reduction can't have a memory sink, an induction can't have a memory
-///   source). This is important and must not be violated (or we have to
-///   resort to checking for cycles through memory).
-///
-///  * A positive constant distance assuming program order that is bigger
-///    than the biggest memory access.
-///
-///     tmp = a[i]        OR              b[i] = x
-///     a[i+2] = tmp                      y = b[i+2];
-///
-///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
-///
-///  * Zero distances and all accesses have the same size.
-///
-class MemoryDepChecker {
-public:
-  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
-  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
-
-  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
-      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
-        ShouldRetryWithRuntimeCheck(false) {}
-
-  /// \brief Register the location (instructions are given increasing numbers)
-  /// of a write access.
-  void addAccess(StoreInst *SI) {
-    Value *Ptr = SI->getPointerOperand();
-    Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
-    InstMap.push_back(SI);
-    ++AccessIdx;
-  }
-
-  /// \brief Register the location (instructions are given increasing numbers)
-  /// of a write access.
-  void addAccess(LoadInst *LI) {
-    Value *Ptr = LI->getPointerOperand();
-    Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
-    InstMap.push_back(LI);
-    ++AccessIdx;
-  }
-
-  /// \brief Check whether the dependencies between the accesses are safe.
-  ///
-  /// Only checks sets with elements in \p CheckDeps.
-  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
-                   MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides);
-
-  /// \brief The maximum number of bytes of a vector register we can vectorize
-  /// the accesses safely with.
-  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
-
-  /// \brief In same cases when the dependency check fails we can still
-  /// vectorize the loop with a dynamic array access check.
-  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
-
-private:
-  ScalarEvolution *SE;
-  const DataLayout *DL;
-  const Loop *InnermostLoop;
-
-  /// \brief Maps access locations (ptr, read/write) to program order.
-  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
-
-  /// \brief Memory access instructions in program order.
-  SmallVector<Instruction *, 16> InstMap;
-
-  /// \brief The program order index to be used for the next instruction.
-  unsigned AccessIdx;
-
-  // We can access this many bytes in parallel safely.
-  unsigned MaxSafeDepDistBytes;
-
-  /// \brief If we see a non-constant dependence distance we can still try to
-  /// vectorize this loop with runtime checks.
-  bool ShouldRetryWithRuntimeCheck;
-
-  /// \brief Check whether there is a plausible dependence between the two
-  /// accesses.
-  ///
-  /// Access \p A must happen before \p B in program order. The two indices
-  /// identify the index into the program order map.
-  ///
-  /// This function checks  whether there is a plausible dependence (or the
-  /// absence of such can't be proved) between the two accesses. If there is a
-  /// plausible dependence but the dependence distance is bigger than one
-  /// element access it records this distance in \p MaxSafeDepDistBytes (if this
-  /// distance is smaller than any other distance encountered so far).
-  /// Otherwise, this function returns true signaling a possible dependence.
-  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
-                   const MemAccessInfo &B, unsigned BIdx,
-                   ValueToValueMap &Strides);
-
-  /// \brief Check whether the data dependence could prevent store-load
-  /// forwarding.
-  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
-};
-
-} // end anonymous namespace
-
 static bool isInBoundsGep(Value *Ptr) {
   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
     return GEP->isInBounds();
@@ -4493,6 +4114,8 @@
 
 bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
                                                     unsigned TypeByteSize) {
+  // FIXME
+  return false;
   // If loads occur at a distance that is not a multiple of a feasible vector
   // factor store-load forwarding does not take place.
   // Positive dependences might cause troubles because vectorizing them might
@@ -4656,45 +4279,6 @@
   return false;
 }
 
-bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
-                                   MemAccessInfoSet &CheckDeps,
-                                   ValueToValueMap &Strides) {
-
-  MaxSafeDepDistBytes = -1U;
-  while (!CheckDeps.empty()) {
-    MemAccessInfo CurAccess = *CheckDeps.begin();
-
-    // Get the relevant memory access set.
-    EquivalenceClasses<MemAccessInfo>::iterator I =
-      AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
-
-    // Check accesses within this set.
-    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
-    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
-
-    // Check every access pair.
-    while (AI != AE) {
-      CheckDeps.erase(*AI);
-      EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
-      while (OI != AE) {
-        // Check every accessing instruction pair in program order.
-        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
-             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
-          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
-               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
-            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides))
-              return false;
-            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides))
-              return false;
-          }
-        ++OI;
-      }
-      AI++;
-    }
-  }
-  return true;
-}
-
 bool LoopVectorizationLegality::canVectorizeMemory() {
 
   typedef SmallVector<Value*, 16> ValueVector;