Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -304,6 +304,7 @@
 void initializeFloat2IntPass(PassRegistry&);
 void initializeLoopDistributePass(PassRegistry&);
 void initializeSjLjEHPreparePass(PassRegistry&);
+void initializeLoopEditorTestPass(PassRegistry&);
 }
 
 #endif
Index: include/llvm/Transforms/Utils/LoopEditor.h
===================================================================
--- /dev/null
+++ include/llvm/Transforms/Utils/LoopEditor.h
@@ -0,0 +1,413 @@
+//===-- LoopEditor.h - High-level loop transformations --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LoopEditor provides a toolkit for performing high-level transforms on
+// loops.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_UTILS_LOOPEDITOR_H
+#define LLVM_TRANSFORMS_SCALAR_UTILS_LOOPEDITOR_H
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/Twine.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/Transforms/Utils/ValueMapper.h>
+#include <functional>
+#include <map>
+
+namespace llvm {
+
+class BasicBlock;
+class DataLayout;
+class DominatorTree;
+class Loop;
+class LoopInfo;
+class PHINode;
+class SCEV;
+class ScalarEvolution;
+class Value;
+
+/// \brief This class provides a toolkit to create high-level operations on
+/// loops.
+///
+/// The functionality consists of a set of core functions which are designed
+/// to be composable, and a set of helper routines that stitch together
+/// the core functions to provide higher-level constructions such as loop
+/// versioning, or loop peeling.
+///
+/// Many functions can take a Delegate object, which allows the user to be
+/// notified when different events happen (such as for example "an instruction
+/// has been interleaved"). This allows the core functionality to remain
+/// simple and not need too much configuration, while allowing the user
+/// the flexibility to hook into the internals. Some delegate methods offer
+/// the user the chance to influence the edit operation.
+///
+/// All of the core functions are designed to keep LoopInfo, ScalarEvolution
+/// and DominatorTree up to date.
+class LoopEditor {
+public:
+  /// A recurrence is any PHI node in the header block of a loop.
+  class Recurrence {
+  protected:
+    enum RType {
+      tySimple, tyReduction, tyInduction
+    };
+    /// Default constructor purely for operator[] on maps.
+    Recurrence() {}
+    Recurrence(PHINode *BO, Value *StartV, PHINode *Val) :
+      BasedOn(BO), StartValue(StartV), Val(Val), Ty(tySimple) {
+    }
+    /// If a loop is cloned, its clone will have this member of each of its
+    /// recurrences and inductions set to the original recurrence or induction.
+    ///
+    /// This allows LoopEditor to reason about sets of PHIs and how to merge
+    /// them together when stitching the outputs of loops to the input of other
+    /// loops (such as in LoopEditor::addIncoming).
+    PHINode *BasedOn;
+    /// For an induction or a reduction, the "start value" of the PHI (the value
+    /// incoming from the preheader).
+    Value *StartValue;
+    /// The PHInode itself.
+    PHINode *Val;
+    /// The type of the recurrence.
+    RType Ty;
+    // This class is deliberately opaque to users, but allow LoopEditor full
+    // access.
+    friend class LoopEditor;
+  public:
+    /// Return the start value of this recurrence
+    Value *getStartValue() const { return StartValue; }
+    /// Return the PHI in the loop header that identifies this recurrence.
+    PHINode *getPHI() const { return Val; }
+    bool operator < (const Recurrence &R) const {
+      return Val < R.Val;
+    }
+  };
+
+  /// A reduction is a type of recurrence that has one user outside the loop,
+  /// and is inductive: R[i] = f(i) OP R[i-1], where OP is an associative
+  /// binary operator or min()/max().
+  class Reduction : public Recurrence {
+    Reduction(Value *StartV, PHINode *Val) : 
+      Recurrence(nullptr, StartV, Val) {
+      Ty = tyReduction;
+    }
+    /// Only available for operator[] on maps.
+    Reduction() { assert(0 && "Unreachable!"); }
+  public:
+    /// Emit code to reduce Op1 and Op2 to one value.
+    Value *createOp(IRBuilder<> &IRB, Value *Op1, Value *Op2) const;
+    /// Methods for support type inquiry through isa, cast, and dyn_cast:
+    static inline bool classof(const Recurrence *R) {
+      return R->Ty == tyReduction;
+    }
+    // Open up to LoopEditor.
+    friend class LoopEditor;
+  };
+  /// A semi-opaque descriptor for an induction. An induction is a recurrence
+  /// with a constant inductive step: I[i] = C + I[i-1].
+  class Induction : private Recurrence {
+    Induction(Value *StartV, PHINode *Val) :
+      Recurrence(nullptr, StartV, Val) {
+      Ty = tyInduction;
+    }
+    /// Only available for operator[] on maps.
+    Induction() { assert(0 && "Unreachable!"); }
+  public:
+    /// Methods for support type inquiry through isa, cast, and dyn_cast:
+    static inline bool classof(const Recurrence *R) {
+      return R->Ty == tyInduction;
+    }
+
+    // Open up to LoopEditor.
+    friend class LoopEditor;
+  };
+
+public:
+  /// Delegates should be subclassed by users. They can optionally chain
+  /// to other delegates for composability.
+  ///
+  /// Delegates provide many callback functions, but only a subset (if any)
+  /// will be invoked for any one core function call.
+  ///
+  /// It is forbidden to modify the content of the loop when servicing one
+  /// of the "notify*" callbacks.
+  struct Delegate {
+    Delegate *Next;
+
+    Delegate() : Next(nullptr) {}
+    Delegate(Delegate *Next) : Next(Next) {}
+    virtual void anchor(); // Provide a home for the vtable.
+
+    /// A call to interleave() is about to happen, for the given iteration.
+    /// Iteration starts at 1 (the zero'th iteration would correspond to
+    /// the original loop content).
+    ///
+    /// Called by: widenAndInterleave()
+    virtual void notifyInterleaveIterationStarting(unsigned Iteration) {
+      if (Next) Next->notifyInterleaveIterationStarting(Iteration);
+    }
+    /// \c OldInst has been cloned to \c NewInst, during a call to
+    /// \c interleave()
+    ///
+    /// Called by: interleave()
+    virtual void notifyInstructionInterleaved(Instruction *OldInst,
+                                              Instruction *NewInst) {
+      if (Next) Next->notifyInstructionInterleaved(OldInst, NewInst);
+    }
+    /// \c OldInst has been peeled out of the loop as \c NewInst.
+    /// The peel iteration was \c Iteration, which starts from zero.
+    ///
+    /// Called by: peelBefore(),peelAfter()
+    virtual void notifyInstructionPeeled(Instruction *OldInst,
+                                         Instruction *NewInst,
+                                         unsigned Iteration) {
+      if (Next) Next->notifyInstructionPeeled(OldInst, NewInst, Iteration);
+    }
+    /// During a clone() operation, \c V needs to be cloned. VM contains the
+    /// values that have already been cloned (apart from PHI operands which
+    /// will be remapped later).
+    /// \return The value that V should have in the cloned loop.
+    ///
+    /// Called by: clone()
+    virtual Value *hookCloneValue(Value *V, ValueToValueMapTy &VM,
+                                  IRBuilder<> &IRB) {
+      if (Next) return Next->hookCloneValue(V, VM, IRB);
+      return nullptr;
+    }
+    /// When interleaving a loop, each interleaved iteration will require an
+    /// offset to be applied to induction variables. This hook defines how
+    /// that offset is computed. For example, for an UF=4 interleave:
+    ///   iteration 0 uses IndVar+0
+    ///   iteration 1 uses IndVar+S
+    ///   iteration 2 uses IndVar+S*2
+    ///   iteration 2 uses IndVar+S*3
+    ///
+    /// Where S is the return value of this hook.
+    ///
+    /// Called by: widenAndInterleave()
+    virtual unsigned hookInterleaveInductionStep() {
+      if (Next) return Next->hookInterleaveInductionStep();
+      return 1;
+    }
+    /// The reduction \c R, made up of \c Values, needs to be reduced to one
+    /// value at the exit of the loop.
+    ///
+    /// Called by: widenAndInterleave()
+    virtual Value *hookReduceReductions(Reduction *R, ArrayRef<Value*> Values,
+                                        IRBuilder<> &IRB) {
+      if (Next) return Next->hookReduceReductions(R, Values, IRB);
+      return nullptr;
+    }
+  };
+
+  /// Different transforms require the loop to be analyzable to a different
+  /// degree. For example, loop cloning doesn't need to know anything about
+  /// the loop, whereas loop interleaving needs to know that all recurrences
+  /// are either inductions or reductions.
+  ///
+  /// The AnalysisLevel describes the extent to which the loop was analyzable;
+  /// some functions may have minimum analysis requirements.
+  enum AnalysisLevel {
+    /// The loop was not able to be analyzed at all.
+    AL_None,
+    /// The loop trip count was able to be determined.
+    AL_TripCount,
+    /// The loop contains a single backedge and is in LCSSA form.
+    AL_LCSSA,
+    /// All recurrences were categorized (into inductions and reductions),
+    /// as well as the trip count being identified.
+    AL_AllRecurrences
+  };
+
+  /// Contructs a new loop editor, editing L.
+  LoopEditor(Loop *L, ScalarEvolution *SE, const DataLayout *DL, LoopInfo *LI,
+             DominatorTree *DT);
+
+  ///
+  /// Queries
+  ///
+
+  /// Returns the level to which the loop was able to be analyzed.
+  AnalysisLevel getAnalysisLevel() const;
+
+  /// Returns true if I is contained within blocks the loop editor owns.
+  /// This is equivalent to Loop::contains(), but it also checks the optional
+  /// bypass and dedicated exit blocks that are outside of Loop's knowledge.
+  bool contains(Instruction *I);
+
+  /// Allow accesses to the underlying Loop using ->.
+  Loop *operator -> () const { return L; }
+
+  /// Return the recurrence referred to by this PHI, if any. A recurrence is
+  /// referred to by its PHI in the loop header block. Returns nullptr on
+  /// failure.
+  Recurrence *getRecurrenceByPHI(PHINode *V);
+
+  /// Retrieves all known recurrences.
+  ArrayRef<Recurrence*> getAllRecurrences();
+
+  /// Returns the value of the given recurrence on exiting the loop.
+  Value *getOutgoingRecurrence(const Recurrence &R);
+
+  /// Retrieves a Recurrence valid for this loop, if one is found to be based
+  /// on or derived from the given recurrence. This will happen if this loop has
+  /// been cloned from another loop previously.
+  Recurrence *getMatchingRecurrence(const Recurrence &R);
+
+  /// Retrieves the executed trip count at the end of the loop. This may be
+  /// zero if the loop was bypassed by a predicate, or less than the normal
+  /// trip count if the loop was widened.
+  ///
+  /// Requires AnalysisLevel AL_TripCount or above.
+  Value *getExecutedTripCount();
+
+  /// Retrieves the trip count of the loop as a SCEV. Note that this involves
+  /// adding one to the backedge count and as such may overflow when expanded.
+  ///
+  /// Requires AnalysisLevel AL_TripCount or above.
+  const SCEV *getTripCount();
+
+  ///
+  /// Mutations
+  ///
+  
+  /// Produce a new version of this loop. The new loop is returned in
+  /// LoopEditor.
+  ///
+  /// The new loop is inserted into the CFG on IncomingEdge (IncomingEdge is
+  /// updated), so if IncomingEdge is A -> B, the result of this function will
+  /// be A -> LOOP -> B.
+  LoopEditor clone(Use &IncomingEdge, const Twine &NameSuffix = "");
+
+  /// A "widened" loop iterates over the same range but with a wider step.
+  /// For example the AddRec {5,1}<%L> widened by a factor 4 becomes {5,4}<%L>.
+  ///
+  /// This is a requirement for loop interleaving and loop vectorization, but
+  /// importantly this does not actually do any unrolling or vectorization.
+  /// Instead, all induction variables have their steps altered and the trip
+  /// count is modified.
+  ///
+  /// Reductions are not altered. Requires AnalysisLevel AL_AllRecurrences.
+  void widen(unsigned Factor);
+
+  /// Set the loop trip count. If MayOverflow is set, then an overflow check
+  /// will be emitted to ensure that TripCount is not zero.
+  ///
+  /// The overflow check may be omitted if the LoopEditor can prove that
+  /// TripCount is less than the current loop trip count, and there is currently
+  /// no overflow check.
+  ///
+  /// Requires AnalysisLevel AL_TripCount.
+  void setTripCount(const SCEV *TripCount, bool MayOverflow=true);
+  
+  /// Adds a new reduction variable starting at StartValue. The reduction does
+  /// not do anything until it is connected with connectReduction().
+  ///
+  /// Requires AnalysisLevel AL_LCSSA.
+  Reduction *addReduction(Value *StartValue);
+
+  /// Connects the backedge of a reduction added with addReduction().
+  ///
+  /// Requires AnalysisLevel AL_LCSSA.
+  void connectReduction(const Reduction *ID, Value *V);
+
+  /// Creates a new copy of all instructions in the loop body, except control flow
+  /// instructions. Does not modify the control flow in the loop at all.
+  ///
+  /// The inserted instructions are interleaved. Requires AnalysisLevel
+  /// AL_AllRecurrences.
+  ///
+  /// Invokes the following functions on the delegate:
+  ///   hookInductionVariableUse(Induction &Induction, IRBuilder<> IRB)
+  ///     Called when a use of an induction variable is encountered. Expected
+  ///     to return the value to use as an induction variable.
+  ///
+  ///   hookReductionVariableUse(Reduction &Reduction, IRBuilder<> IRB)
+  ///     Called when a use of a reduction variable is encountered. Expected
+  ///     to return the value to use as a reduction.
+  ///
+  ///   notifyReductionVariableDefined(Reduction &Reduction, Value *Def)
+  ///     Called when a feed of a reduction variable is seen. The arguments are
+  ///     the newly created Value and the Reduction it fed in the original loop.
+  void interleave(Delegate *D);
+
+  /// Returns a dedicated exiting block, creating one if it does not exist. A
+  /// dedicated exiting block is one where the only incoming arcs are from
+  /// the loop latch or optionally the bypass block (The bypass block is a 
+  /// check to see if the loop would actually be executed at least once, as
+  /// the trip count check is on the backedge).
+  BasicBlock *getOrCreateDedicatedExitingBlock();
+
+  /// Changes the block that is executed after this loop.
+  void updateExitBlock(BasicBlock *BB);
+
+  /// Add an incoming edge from the loop L, with the given mappings between our
+  /// and L's recurrences
+  ///
+  /// The Recurrences can either reference this loop or L's.
+  void addIncoming(LoopEditor &L,
+                   const std::map<Recurrence*, Value*> &Recurrences);
+
+  /// Add an incoming edge from the basic block BB.
+  void addIncoming(BasicBlock *BB);
+
+  /// Remove an incoming edge from BB. The edge must already exist.
+  void removeIncoming(BasicBlock *BB);
+
+  ///
+  /// Macro-mutations - These are all helper functions implemented in terms of
+  ///                   the mutations above.
+  /// 
+
+  /// Widens the loop by \c Factor and performs interleaved loop unrolling.
+  ///
+  /// The reductions in the original loop get duplicated \c Factor times,
+  /// then reduced at the end of the loop into one value and returned in Ret.
+  void widenAndInterleaveLoop(unsigned Factor,
+                              std::map<Reduction*, Value*> &Ret,
+                              Delegate *D=nullptr);
+
+  /// Versions, widens, and interleaves the loop by \c Factor.
+  ///
+  /// The original loop is cloned and a widened loop created, which branches
+  /// to the original scalar loop as a tail.
+  ///
+  /// PredBB is created as a block to determine whether to branch to the widened
+  /// loop or the scalar one (independent of any trip count and overflow checks).
+  /// It is initially set up with just one branch:
+  ///   br i1 true, label %versioned-loop, label %scalar-loop
+  LoopEditor versionWidenAndInterleaveLoop(unsigned Factor, BasicBlock *&PredBB,
+                                           Delegate *D=nullptr);
+
+  /// Peels \c Iterations iterations out of the loop and places them before the
+  /// loop. The trip count of the loop is decreased to compensate.
+  void peelBefore(unsigned Iterations, Delegate *D=nullptr);
+
+  /// Peels \c Iterations iterations out of the loop and places them after the
+  /// loop. The trip count of the loop is decreased to compensate.
+  void peelAfter(unsigned Iterations, Delegate *D=nullptr);
+
+private:
+  //
+  // Immutable/analysis state
+  //
+  Loop *L;
+  ScalarEvolution *SE;
+  const DataLayout *DL;
+  LoopInfo *LI;
+  DominatorTree *DT;
+  bool Analyzed;
+};
+} // end namespace llvm
+
+#endif
Index: lib/Target/ARM/Thumb2ITBlockPass.cpp
===================================================================
--- lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -183,6 +183,7 @@
     DebugLoc dl = MI->getDebugLoc();
     unsigned PredReg = 0;
     ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg);
+
     if (CC == ARMCC::AL) {
       ++MBBI;
       continue;
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -81,6 +81,7 @@
   initializePlaceSafepointsPass(Registry);
   initializeFloat2IntPass(Registry);
   initializeLoopDistributePass(Registry);
+  initializeLoopEditorTestPass(Registry);
 }
 
 void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
Index: lib/Transforms/Utils/CMakeLists.txt
===================================================================
--- lib/Transforms/Utils/CMakeLists.txt
+++ lib/Transforms/Utils/CMakeLists.txt
@@ -18,6 +18,7 @@
   IntegerDivision.cpp
   LCSSA.cpp
   Local.cpp
+  LoopEditor.cpp
   LoopSimplify.cpp
   LoopUnroll.cpp
   LoopUnrollRuntime.cpp
Index: lib/Transforms/Utils/LoopEditor.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Utils/LoopEditor.cpp
@@ -0,0 +1,244 @@
+//===-- LoopEditor.cpp - High-level loop transformations ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LoopEditor provides a toolkit for performing high-level transforms on
+// loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopEditor.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/PrettyStackTrace.h"
+
+#define DEBUG_TYPE "loopeditor"
+using namespace llvm;
+
+void LoopEditor::Delegate::anchor() {}
+
+//
+// Helper functions.
+//
+Value *LoopEditor::Reduction::createOp(IRBuilder<> &IRB, Value *Op1, Value *Op2) const {
+  assert(0 && "Implement!");
+}
+
+//
+// Public API
+//
+
+LoopEditor::LoopEditor(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
+                       LoopInfo *LI, DominatorTree *DT)
+  : L(L), SE(SE), DL(DL), LI(LI), DT(DT) {
+}
+
+LoopEditor::AnalysisLevel LoopEditor::getAnalysisLevel() const {
+  assert(0 && "Not implemented!");
+  return AL_None;
+}
+
+bool LoopEditor::contains(Instruction *I) {
+  assert(0 && "Not implemented!");
+  return false;
+}
+
+LoopEditor::Recurrence *LoopEditor::getRecurrenceByPHI(PHINode *V) {
+  assert(0 && "Not implemented!");
+  return nullptr;
+}
+
+ArrayRef<LoopEditor::Recurrence*> LoopEditor::getAllRecurrences() {
+  assert(0 && "Not implemented!");
+  return ArrayRef<LoopEditor::Recurrence*>();
+}
+
+Value *LoopEditor::getOutgoingRecurrence(const Recurrence &R) {
+  assert(0 && "Not implemented!");
+  return nullptr;
+}
+
+LoopEditor::Recurrence *LoopEditor::getMatchingRecurrence(const Recurrence &R) {
+  assert(0 && "Not implemented!");
+  return nullptr;
+}
+
+Value *LoopEditor::getExecutedTripCount() {
+  assert(0 && "Not implemented!");
+  return nullptr;
+}
+
+const SCEV *LoopEditor::getTripCount() {
+  assert(0 && "Not implemented!");
+  return nullptr;
+}
+
+void LoopEditor::setTripCount(const SCEV *TC, bool MayOverflow) {
+  assert(0 && "Not implemented!");
+}
+
+LoopEditor LoopEditor::clone(Use &IncomingEdge,
+                             const Twine &NameSuffix) {
+  assert(0 && "Not implemented!");
+  return *this;
+}
+
+void LoopEditor::widen(unsigned Factor) {
+  assert(0 && "Not implemented!");
+}
+
+LoopEditor::Reduction *LoopEditor::addReduction(Value *StartValue) {
+  assert(0 && "Not implemented!");
+  return nullptr;
+}
+
+void LoopEditor::connectReduction(const Reduction *ID, Value *V) {
+  assert(0 && "Not implemented!");
+}
+
+void LoopEditor::interleave(Delegate *D) {
+  assert(0 && "Not implemented!");
+}
+
+BasicBlock *LoopEditor::getOrCreateDedicatedExitingBlock() {
+  assert(0 && "Not implemented!");
+  return nullptr;
+}
+
+void LoopEditor::updateExitBlock(BasicBlock *BB) {
+  assert(0 && "Not implemented!");
+}
+
+void LoopEditor::addIncoming(
+  LoopEditor &LE,
+  const std::map<Recurrence*, Value*> &Recurrences) {
+  assert(0 && "Not implemented!");
+}
+
+void LoopEditor::addIncoming(BasicBlock *BB) {
+  assert(0 && "Not implemented!");
+}
+
+void LoopEditor::removeIncoming(BasicBlock *BB) {
+  assert(0 && "Not implemented!");
+}
+
+//
+// Macro-mutators
+//
+
+void LoopEditor::widenAndInterleaveLoop(unsigned Factor,
+                                         std::map<Reduction*, Value*> &Ret,
+                                         Delegate *D) {
+  assert(0 && "Not implemented!");
+}
+
+LoopEditor LoopEditor::versionWidenAndInterleaveLoop(unsigned Factor,
+                                                     BasicBlock *&PredBB,
+                                                     Delegate *D) {
+  assert(0 && "Not implemented!");
+}
+
+void LoopEditor::peelBefore(unsigned Iterations, Delegate *D) {
+  assert(0 && "Not implemented!");
+}
+
+void LoopEditor::peelAfter(unsigned Iterations, Delegate *D) {
+  assert(0 && "Not implemented!");
+}
+
+//
+// Dummy test pass
+//
+enum TestCommand {
+  TC_Clone
+};
+cl::opt<TestCommand> TestCommand(
+  cl::desc("Loop editor test command"), cl::Hidden, cl::values(
+    clEnumValN(TC_Clone, "clone", "Test the ::clone() function"),
+    clEnumValEnd));
+cl::opt<std::string> TestLoop(cl::desc("Loop editor loop to operate on"),
+                              cl::Hidden, cl::init("TestLoop"));
+cl::opt<std::string> TestEdge(cl::desc("Loop editor edge to clone on"),
+                              cl::Hidden, cl::init("TestEdge"));
+
+struct LoopEditorTest : FunctionPass {
+  static char ID;
+  LoopEditorTest() : FunctionPass(ID) {
+    initializeLoopEditorTestPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolution>();
+  }
+};
+char LoopEditorTest::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopEditorTest, "test-loop-editor",
+                      "Dummy test pass to test LoopEditor",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(LoopEditorTest, "test-loop-editor",
+                    "Dummy test pass to test LoopEditor",
+                    false, false)
+
+bool LoopEditorTest::runOnFunction(Function &F) {
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  ScalarEvolution &SE = getAnalysis<ScalarEvolution>();
+
+  auto I = std::find_if(F.begin(), F.end(), [](const BasicBlock &BB) {
+      return BB.getName() == TestLoop;
+    });
+  if (I == F.end())
+    report_fatal_error("Could not find basicblock %" + TestLoop +
+                       " in function @" + F.getName(), false);
+
+  Loop *L = LI.getLoopFor(I);
+  if (!L) 
+    report_fatal_error("Could not find loop for BB %" + TestLoop + 
+                       " in function @" + F.getName(), false);
+
+  LoopEditor LE(L, &SE, &F.getParent()->getDataLayout(), &LI, &DT);
+  switch (TestCommand) {
+  case TC_Clone: {
+    auto I = std::find_if(F.begin(), F.end(), [](const BasicBlock &BB) {
+                            return BB.getName() == TestEdge;
+                          });
+    if (I == F.end())
+      report_fatal_error("Could not find test edge %" + TestEdge, false);
+    if (I->size() != 1)
+      report_fatal_error("Test edge %" + TestEdge + " does not contain solely"
+                         " an unconditional branch!", false);
+    BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
+    if (!BI || BI->isConditional())
+      report_fatal_error("Test edge %" + TestEdge + " was not an unconditional"
+                         " branch!", false);
+    Use &Edge = *BI->op_begin();
+    LE.clone(Edge, ".cloned");
+  }
+  }
+
+  return true;
+}
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -98,6 +98,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopEditor.h"
 #include <algorithm>
 #include <map>
 #include <tuple>
@@ -289,7 +290,7 @@
 
   virtual ~InnerLoopVectorizer() {}
 
-protected:
+public:
   /// A small list of PHINodes.
   typedef SmallVector<PHINode*, 4> PhiVector;
   /// When we unroll loops we have multiple vector values for each scalar.
@@ -329,6 +330,7 @@
 
   /// A helper function to vectorize a single BB within the innermost loop.
   void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
+  void vectorizeSingleInstruction(Instruction *I, PhiVector *PV);
 
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
@@ -433,7 +435,8 @@
   /// vector elements.
   unsigned VF;
 
-protected:
+  friend class VectorizerDelegate;
+public:
   /// The vectorization unroll factor to use. Each scalar is vectorized to this
   /// many different vector instructions.
   unsigned UF;
@@ -2456,404 +2459,7 @@
 }
 
 void InnerLoopVectorizer::createEmptyLoop() {
-  /*
-   In this function we generate a new loop. The new loop will contain
-   the vectorized instructions while the old loop will continue to run the
-   scalar remainder.
-
-       [ ] <-- Back-edge taken count overflow check.
-    /   |
-   /    v
-  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
-  |  /  |
-  | /   v
-  ||   [ ]     <-- vector pre header.
-  ||    |
-  ||    v
-  ||   [  ] \
-  ||   [  ]_|   <-- vector loop.
-  ||    |
-  | \   v
-  |   >[ ]   <--- middle-block.
-  |  /  |
-  | /   v
-  -|- >[ ]     <--- new preheader.
-   |    |
-   |    v
-   |   [ ] \
-   |   [ ]_|   <-- old scalar loop to handle remainder.
-    \   |
-     \  v
-      >[ ]     <-- exit block.
-   ...
-   */
-
-  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
-  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
-  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
-  assert(VectorPH && "Invalid loop structure");
-  assert(ExitBlock && "Must have an exit block");
-
-  // Some loops have a single integer induction variable, while other loops
-  // don't. One example is c++ iterators that often have multiple pointer
-  // induction variables. In the code below we also support a case where we
-  // don't have a single induction variable.
-  OldInduction = Legal->getInduction();
-  Type *IdxTy = Legal->getWidestInductionType();
-
-  // Find the loop boundaries.
-  const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
-  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
-
-  // The exit count might have the type of i64 while the phi is i32. This can
-  // happen if we have an induction variable that is sign extended before the
-  // compare. The only way that we get a backedge taken count is that the
-  // induction variable was signed and as such will not overflow. In such a case
-  // truncation is legal.
-  if (ExitCount->getType()->getPrimitiveSizeInBits() >
-      IdxTy->getPrimitiveSizeInBits())
-    ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
-
-  const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
-  // Get the total trip count from the count by adding 1.
-  ExitCount = SE->getAddExpr(BackedgeTakeCount,
-                             SE->getConstant(BackedgeTakeCount->getType(), 1));
-
-  const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout();
-
-  // Expand the trip count and place the new instructions in the preheader.
-  // Notice that the pre-header does not change, only the loop body.
-  SCEVExpander Exp(*SE, DL, "induction");
-
-  // We need to test whether the backedge-taken count is uint##_max. Adding one
-  // to it will cause overflow and an incorrect loop trip count in the vector
-  // body. In case of overflow we want to directly jump to the scalar remainder
-  // loop.
-  Value *BackedgeCount =
-      Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
-                        VectorPH->getTerminator());
-  if (BackedgeCount->getType()->isPointerTy())
-    BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
-                                                "backedge.ptrcnt.to.int",
-                                                VectorPH->getTerminator());
-  Instruction *CheckBCOverflow =
-      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
-                      Constant::getAllOnesValue(BackedgeCount->getType()),
-                      "backedge.overflow", VectorPH->getTerminator());
-
-  // The loop index does not have to start at Zero. Find the original start
-  // value from the induction PHI node. If we don't have an induction variable
-  // then we know that it starts at zero.
-  Builder.SetInsertPoint(VectorPH->getTerminator());
-  Value *StartIdx = ExtendedIdx =
-      OldInduction
-          ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH),
-                               IdxTy)
-          : ConstantInt::get(IdxTy, 0);
-
-  // Count holds the overall loop count (N).
-  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
-                                   VectorPH->getTerminator());
-
-  LoopBypassBlocks.push_back(VectorPH);
-
-  // Split the single block loop into the two loop structure described above.
-  BasicBlock *VecBody =
-      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
-  BasicBlock *MiddleBlock =
-  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
-  BasicBlock *ScalarPH =
-  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
-
-  // Create and register the new vector loop.
-  Loop* Lp = new Loop();
-  Loop *ParentLoop = OrigLoop->getParentLoop();
-
-  // Insert the new loop into the loop nest and register the new basic blocks
-  // before calling any utilities such as SCEV that require valid LoopInfo.
-  if (ParentLoop) {
-    ParentLoop->addChildLoop(Lp);
-    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
-    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
-  } else {
-    LI->addTopLevelLoop(Lp);
-  }
-  Lp->addBasicBlockToLoop(VecBody, *LI);
-
-  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
-  // inside the loop.
-  Builder.SetInsertPoint(VecBody->getFirstNonPHI());
-
-  // Generate the induction variable.
-  setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
-  Induction = Builder.CreatePHI(IdxTy, 2, "index");
-  // The loop step is equal to the vectorization factor (num of SIMD elements)
-  // times the unroll factor (num of SIMD instructions).
-  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
-
-  // Generate code to check that the loop's trip count that we computed by
-  // adding one to the backedge-taken count will not overflow.
-  BasicBlock *NewVectorPH =
-      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked");
-  if (ParentLoop)
-    ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
-  ReplaceInstWithInst(
-      VectorPH->getTerminator(),
-      BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow));
-  VectorPH = NewVectorPH;
-
-  // This is the IR builder that we use to add all of the logic for bypassing
-  // the new vector loop.
-  IRBuilder<> BypassBuilder(VectorPH->getTerminator());
-  setDebugLocFromInst(BypassBuilder,
-                      getDebugLocFromInstOrOperands(OldInduction));
-
-  // We may need to extend the index in case there is a type mismatch.
-  // We know that the count starts at zero and does not overflow.
-  if (Count->getType() != IdxTy) {
-    // The exit count can be of pointer type. Convert it to the correct
-    // integer type.
-    if (ExitCount->getType()->isPointerTy())
-      Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
-    else
-      Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
-  }
-
-  // Add the start index to the loop count to get the new end index.
-  Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
-
-  // Now we need to generate the expression for N - (N % VF), which is
-  // the part that the vectorized body will execute.
-  Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
-  Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
-  Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
-                                                     "end.idx.rnd.down");
-
-  // Now, compare the new count to zero. If it is zero skip the vector loop and
-  // jump to the scalar loop.
-  Value *Cmp =
-      BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
-  NewVectorPH =
-      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
-  if (ParentLoop)
-    ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
-  LoopBypassBlocks.push_back(VectorPH);
-  ReplaceInstWithInst(VectorPH->getTerminator(),
-                      BranchInst::Create(MiddleBlock, NewVectorPH, Cmp));
-  VectorPH = NewVectorPH;
-
-  // Generate the code to check that the strides we assumed to be one are really
-  // one. We want the new basic block to start at the first instruction in a
-  // sequence of instructions that form a check.
-  Instruction *StrideCheck;
-  Instruction *FirstCheckInst;
-  std::tie(FirstCheckInst, StrideCheck) =
-      addStrideCheck(VectorPH->getTerminator());
-  if (StrideCheck) {
-    AddedSafetyChecks = true;
-    // Create a new block containing the stride check.
-    VectorPH->setName("vector.stridecheck");
-    NewVectorPH =
-        VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
-    if (ParentLoop)
-      ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
-    LoopBypassBlocks.push_back(VectorPH);
-
-    // Replace the branch into the memory check block with a conditional branch
-    // for the "few elements case".
-    ReplaceInstWithInst(
-        VectorPH->getTerminator(),
-        BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck));
-
-    VectorPH = NewVectorPH;
-  }
-
-  // Generate the code that checks in runtime if arrays overlap. We put the
-  // checks into a separate block to make the more common case of few elements
-  // faster.
-  Instruction *MemRuntimeCheck;
-  std::tie(FirstCheckInst, MemRuntimeCheck) =
-      Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator());
-  if (MemRuntimeCheck) {
-    AddedSafetyChecks = true;
-    // Create a new block containing the memory check.
-    VectorPH->setName("vector.memcheck");
-    NewVectorPH =
-        VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph");
-    if (ParentLoop)
-      ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
-    LoopBypassBlocks.push_back(VectorPH);
-
-    // Replace the branch into the memory check block with a conditional branch
-    // for the "few elements case".
-    ReplaceInstWithInst(
-        VectorPH->getTerminator(),
-        BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck));
-
-    VectorPH = NewVectorPH;
-  }
-
-  // We are going to resume the execution of the scalar loop.
-  // Go over all of the induction variables that we found and fix the
-  // PHIs that are left in the scalar version of the loop.
-  // The starting values of PHI nodes depend on the counter of the last
-  // iteration in the vectorized loop.
-  // If we come from a bypass edge then we need to start from the original
-  // start value.
-
-  // This variable saves the new starting index for the scalar loop.
-  PHINode *ResumeIndex = nullptr;
-  LoopVectorizationLegality::InductionList::iterator I, E;
-  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
-  // Set builder to point to last bypass block.
-  BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
-  for (I = List->begin(), E = List->end(); I != E; ++I) {
-    PHINode *OrigPhi = I->first;
-    LoopVectorizationLegality::InductionInfo II = I->second;
-
-    Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
-    PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
-                                         MiddleBlock->getTerminator());
-    // We might have extended the type of the induction variable but we need a
-    // truncated version for the scalar loop.
-    PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
-      PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
-                      MiddleBlock->getTerminator()) : nullptr;
-
-    // Create phi nodes to merge from the  backedge-taken check block.
-    PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
-                                           ScalarPH->getTerminator());
-    BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
-
-    PHINode *BCTruncResumeVal = nullptr;
-    if (OrigPhi == OldInduction) {
-      BCTruncResumeVal =
-          PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
-                          ScalarPH->getTerminator());
-      BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
-    }
-
-    Value *EndValue = nullptr;
-    switch (II.IK) {
-    case LoopVectorizationLegality::IK_NoInduction:
-      llvm_unreachable("Unknown induction");
-    case LoopVectorizationLegality::IK_IntInduction: {
-      // Handle the integer induction counter.
-      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
-
-      // We have the canonical induction variable.
-      if (OrigPhi == OldInduction) {
-        // Create a truncated version of the resume value for the scalar loop,
-        // we might have promoted the type to a larger width.
-        EndValue =
-          BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
-        // The new PHI merges the original incoming value, in case of a bypass,
-        // or the value at the end of the vectorized loop.
-        for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-          TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
-        TruncResumeVal->addIncoming(EndValue, VecBody);
-
-        BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
-
-        // We know what the end value is.
-        EndValue = IdxEndRoundDown;
-        // We also know which PHI node holds it.
-        ResumeIndex = ResumeVal;
-        break;
-      }
-
-      // Not the canonical induction variable - add the vector loop count to the
-      // start value.
-      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
-                                                   II.StartValue->getType(),
-                                                   "cast.crd");
-      EndValue = II.transform(BypassBuilder, CRD);
-      EndValue->setName("ind.end");
-      break;
-    }
-    case LoopVectorizationLegality::IK_PtrInduction: {
-      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
-                                                   II.StepValue->getType(),
-                                                   "cast.crd");
-      EndValue = II.transform(BypassBuilder, CRD);
-      EndValue->setName("ptr.ind.end");
-      break;
-    }
-    }// end of case
-
-    // The new PHI merges the original incoming value, in case of a bypass,
-    // or the value at the end of the vectorized loop.
-    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
-      if (OrigPhi == OldInduction)
-        ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
-      else
-        ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
-    }
-    ResumeVal->addIncoming(EndValue, VecBody);
-
-    // Fix the scalar body counter (PHI node).
-    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
-
-    // The old induction's phi node in the scalar body needs the truncated
-    // value.
-    if (OrigPhi == OldInduction) {
-      BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
-      OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
-    } else {
-      BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
-      OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
-    }
-  }
-
-  // If we are generating a new induction variable then we also need to
-  // generate the code that calculates the exit value. This value is not
-  // simply the end of the counter because we may skip the vectorized body
-  // in case of a runtime check.
-  if (!OldInduction){
-    assert(!ResumeIndex && "Unexpected resume value found");
-    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
-                                  MiddleBlock->getTerminator());
-    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
-    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
-  }
-
-  // Make sure that we found the index where scalar loop needs to continue.
-  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
-         "Invalid resume Index");
-
-  // Add a check in the middle block to see if we have completed
-  // all of the iterations in the first vector loop.
-  // If (N - N%VF) == N, then we *don't* need to run the remainder.
-  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
-                                ResumeIndex, "cmp.n",
-                                MiddleBlock->getTerminator());
-  ReplaceInstWithInst(MiddleBlock->getTerminator(),
-                      BranchInst::Create(ExitBlock, ScalarPH, CmpN));
-
-  // Create i+1 and fill the PHINode.
-  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
-  Induction->addIncoming(StartIdx, VectorPH);
-  Induction->addIncoming(NextIdx, VecBody);
-  // Create the compare.
-  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
-  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
-
-  // Now we have two terminators. Remove the old one from the block.
-  VecBody->getTerminator()->eraseFromParent();
-
-  // Get ready to start creating new instructions into the vectorized body.
-  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
-
-  // Save the state.
-  LoopVectorPreHeader = VectorPH;
-  LoopScalarPreHeader = ScalarPH;
-  LoopMiddleBlock = MiddleBlock;
-  LoopExitBlock = ExitBlock;
-  LoopVectorBody.push_back(VecBody);
-  LoopScalarBody = OldBasicBlock;
-
-  LoopVectorizeHints Hints(Lp, true);
+  LoopVectorizeHints Hints(OrigLoop, true);
   Hints.setAlreadyVectorized();
 }
 
@@ -3021,239 +2627,99 @@
   return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
 }
 
-void InnerLoopVectorizer::vectorizeLoop() {
-  //===------------------------------------------------===//
-  //
-  // Notice: any optimization or new instruction that go
-  // into the code below should be also be implemented in
-  // the cost-model.
-  //
-  //===------------------------------------------------===//
-  Constant *Zero = Builder.getInt32(0);
+class VectorizerDelegate : public LoopEditor::Delegate {
+  InnerLoopVectorizer &LV;
+public:
+  VectorizerDelegate(InnerLoopVectorizer &LV) : LV(LV) {
+  };
 
-  // In order to support reduction variables we need to be able to vectorize
-  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
-  // stages. First, we create a new vector PHI node with no incoming edges.
-  // We use this value when we vectorize all of the instructions that use the
-  // PHI. Next, after all of the instructions in the block are complete we
-  // add the new incoming edges to the PHI. At this point all of the
-  // instructions in the basic block are vectorized, so we can use them to
-  // construct the PHI.
-  PhiVector RdxPHIsToFix;
-
-  // Scan the loop in a topological order to ensure that defs are vectorized
-  // before users.
-  LoopBlocksDFS DFS(OrigLoop);
-  DFS.perform(LI);
+  Value *hookCloneValue(Value *V, ValueToValueMapTy &VM,
+                        IRBuilder<> &IRB) override {
+    LV.Builder.SetInsertPoint(IRB.GetInsertPoint());
+    unsigned UF = LV.UF;
+    LV.UF = 1;
+    InnerLoopVectorizer::PhiVector Dummy;
+    LV.vectorizeSingleInstruction(cast<Instruction>(V), &Dummy);
 
-  // Vectorize all of the blocks in the original loop.
-  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
-       be = DFS.endRPO(); bb != be; ++bb)
-    vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
-
-  // At this point every instruction in the original loop is widened to
-  // a vector form. We are almost done. Now, we need to fix the PHI nodes
-  // that we vectorized. The PHI nodes are currently empty because we did
-  // not want to introduce cycles. Notice that the remaining PHI nodes
-  // that we need to fix are reduction variables.
-
-  // Create the 'reduced' values for each of the induction vars.
-  // The reduced values are the vector values that we scalarize and combine
-  // after the loop is finished.
-  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
-       it != e; ++it) {
-    PHINode *RdxPhi = *it;
-    assert(RdxPhi && "Unable to recover vectorized PHI");
-
-    // Find the reduction variable descriptor.
-    assert(Legal->getReductionVars()->count(RdxPhi) &&
-           "Unable to find the reduction variable");
-    RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
-
-    RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
-    TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
-    Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
-    RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
-        RdxDesc.getMinMaxRecurrenceKind();
-    setDebugLocFromInst(Builder, ReductionStartValue);
-
-    // We need to generate a reduction vector from the incoming scalar.
-    // To do so, we need to generate the 'identity' vector and override
-    // one of the elements with the incoming scalar reduction. We need
-    // to do it in the vector-loop preheader.
-    Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
-
-    // This is the vector-clone of the value that leaves the loop.
-    VectorParts &VectorExit = getVectorValue(LoopExitInst);
-    Type *VecTy = VectorExit[0]->getType();
-
-    // Find the reduction identity variable. Zero for addition, or, xor,
-    // one for multiplication, -1 for And.
-    Value *Identity;
-    Value *VectorStart;
-    if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
-        RK == RecurrenceDescriptor::RK_FloatMinMax) {
-      // MinMax reduction have the start value as their identify.
-      if (VF == 1) {
-        VectorStart = Identity = ReductionStartValue;
-      } else {
-        VectorStart = Identity =
-            Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
-      }
-    } else {
-      // Handle other reduction kinds:
-      Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
-          RK, VecTy->getScalarType());
-      if (VF == 1) {
-        Identity = Iden;
-        // This vector is the Identity vector where the first element is the
-        // incoming scalar reduction.
-        VectorStart = ReductionStartValue;
-      } else {
-        Identity = ConstantVector::getSplat(VF, Iden);
+    LV.UF = UF;
+    return LV.WidenMap.get(V)[0];
+  }
 
-        // This vector is the Identity vector where the first element is the
-        // incoming scalar reduction.
-        VectorStart =
-            Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
-      }
-    }
+  unsigned hookInterleaveInductionStep() override {
+    return LV.VF;
+  }
 
-    // Fix the vector-loop phi.
+  Value *hookReduceReductions(LoopEditor::Reduction *R,
+                              ArrayRef<Value*> Values,
+                              IRBuilder<> &IRB) override {
+    LV.Builder.SetInsertPoint(IRB.GetInsertPoint());
 
-    // Reductions do not have to start at zero. They can start with
-    // any loop invariant values.
-    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
-    BasicBlock *Latch = OrigLoop->getLoopLatch();
-    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
-    VectorParts &Val = getVectorValue(LoopVal);
-    for (unsigned part = 0; part < UF; ++part) {
-      // Make sure to add the reduction stat value only to the
-      // first unroll part.
-      Value *StartVal = (part == 0) ? VectorStart : Identity;
-      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
-                                                  LoopVectorPreHeader);
-      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
-                                                  LoopVectorBody.back());
+    Value *ReducedPartRdx = nullptr;
+    for (auto *V : Values) {
+      if (!ReducedPartRdx)
+        ReducedPartRdx = V;
+      else
+        ReducedPartRdx = R->createOp(IRB, ReducedPartRdx, V);
     }
 
-    // Before each round, move the insertion point right between
-    // the PHIs and the values we are going to write.
-    // This allows us to write both PHINodes and the extractelement
-    // instructions.
-    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
-
-    VectorParts RdxParts;
-    setDebugLocFromInst(Builder, LoopExitInst);
-    for (unsigned part = 0; part < UF; ++part) {
-      // This PHINode contains the vectorized reduction variable, or
-      // the initial value vector, if we bypass the vector loop.
-      VectorParts &RdxExitVal = getVectorValue(LoopExitInst);
-      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
-      Value *StartVal = (part == 0) ? VectorStart : Identity;
-      for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
-      NewPhi->addIncoming(RdxExitVal[part],
-                          LoopVectorBody.back());
-      RdxParts.push_back(NewPhi);
+    if (LV.VF == 1)
+      return ReducedPartRdx;
+
+    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+    // and vector ops, reducing the set of values being computed by half each
+    // round.
+    assert(isPowerOf2_32(LV.VF) &&
+           "Reduction emission only supported for pow2 vectors!");
+    Value *TmpVec = ReducedPartRdx;
+    SmallVector<Constant*, 32> ShuffleMask(LV.VF, nullptr);
+    for (unsigned i = LV.VF; i != 1; i >>= 1) {
+      // Move the upper half of the vector to the lower half.
+      for (unsigned j = 0; j != i/2; ++j)
+        ShuffleMask[j] = LV.Builder.getInt32(i/2 + j);
+
+      // Fill the rest of the mask with undef.
+      std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
+                UndefValue::get(LV.Builder.getInt32Ty()));
+
+      Value *Shuf =
+        IRB.CreateShuffleVector(TmpVec,
+                                UndefValue::get(TmpVec->getType()),
+                                ConstantVector::get(ShuffleMask),
+                                "rdx.shuf");
+
+      TmpVec = R->createOp(IRB, TmpVec, Shuf);
     }
 
-    // Reduce all of the unrolled parts into a single vector.
-    Value *ReducedPartRdx = RdxParts[0];
-    unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
-    setDebugLocFromInst(Builder, ReducedPartRdx);
-    for (unsigned part = 1; part < UF; ++part) {
-      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        // Floating point operations had to be 'fast' to enable the reduction.
-        ReducedPartRdx = addFastMathFlag(
-            Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
-                                ReducedPartRdx, "bin.rdx"));
-      else
-        ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
-            Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
-    }
+    // The result is in the first element of the vector.
+    return IRB.CreateExtractElement(TmpVec, IRB.getInt32(0));
+  }
+};
 
-    if (VF > 1) {
-      // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
-      // and vector ops, reducing the set of values being computed by half each
-      // round.
-      assert(isPowerOf2_32(VF) &&
-             "Reduction emission only supported for pow2 vectors!");
-      Value *TmpVec = ReducedPartRdx;
-      SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
-      for (unsigned i = VF; i != 1; i >>= 1) {
-        // Move the upper half of the vector to the lower half.
-        for (unsigned j = 0; j != i/2; ++j)
-          ShuffleMask[j] = Builder.getInt32(i/2 + j);
-
-        // Fill the rest of the mask with undef.
-        std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
-                  UndefValue::get(Builder.getInt32Ty()));
-
-        Value *Shuf =
-        Builder.CreateShuffleVector(TmpVec,
-                                    UndefValue::get(TmpVec->getType()),
-                                    ConstantVector::get(ShuffleMask),
-                                    "rdx.shuf");
-
-        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-          // Floating point operations had to be 'fast' to enable the reduction.
-          TmpVec = addFastMathFlag(Builder.CreateBinOp(
-              (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
-        else
-          TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
-                                                        TmpVec, Shuf);
-      }
+void InnerLoopVectorizer::vectorizeLoop() {
+  //===------------------------------------------------===//
+  //
+  // Notice: any optimization or new instruction that go
+  // into the code below should be also be implemented in
+  // the cost-model.
+  //
+  //===------------------------------------------------===//
 
-      // The result is in the first element of the vector.
-      ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
-                                                    Builder.getInt32(0));
-    }
+  BasicBlock *PredBB;
+  const DataLayout &DL =
+    OrigLoop->getHeader()->getParent()->getParent()->getDataLayout();
+  LoopEditor ScalarLE(OrigLoop, SE, &DL, LI, DT);
+  VectorizerDelegate D(*this);
+  ScalarLE.versionWidenAndInterleaveLoop(UF, PredBB, &D);
+
+  Instruction *MemRuntimeCheck, *FirstCheckInst;
+  std::tie(FirstCheckInst, MemRuntimeCheck) =
+    Legal->getLAI()->addRuntimeCheck(PredBB->getTerminator());
+  if (MemRuntimeCheck) {
+    AddedSafetyChecks = true;
+    // Create a new block containing the memory check.
+    PredBB->setName("vector.memcheck");
+  }
 
-    // Create a phi node that merges control-flow from the backedge-taken check
-    // block and the middle block.
-    PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
-                                          LoopScalarPreHeader->getTerminator());
-    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]);
-    BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
-
-    // Now, we need to fix the users of the reduction variable
-    // inside and outside of the scalar remainder loop.
-    // We know that the loop is in LCSSA form. We need to update the
-    // PHI nodes in the exit blocks.
-    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
-         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
-      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
-      if (!LCSSAPhi) break;
-
-      // All PHINodes need to have a single entry edge, or two if
-      // we already fixed them.
-      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
-
-      // We found our reduction value exit-PHI. Update it with the
-      // incoming bypass edge.
-      if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
-        // Add an edge coming from the bypass.
-        LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
-        break;
-      }
-    }// end of the LCSSA phi scan.
-
-    // Fix the scalar loop reduction variable with the incoming reduction sum
-    // from the vector body and from the backedge value.
-    int IncomingEdgeBlockIdx =
-    (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
-    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
-    // Pick the other block.
-    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
-    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
-  }// end of for each redux variable.
-
-  fixLCSSAPHIs();
-
-  // Remove redundant induction instructions.
-  cse(LoopVectorBody);
 }
 
 void InnerLoopVectorizer::fixLCSSAPHIs() {
@@ -3458,16 +2924,21 @@
 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    vectorizeSingleInstruction(it, PV);
+  }
+}
+
+void InnerLoopVectorizer::vectorizeSingleInstruction(Instruction *it, PhiVector *PV) {
     VectorParts &Entry = WidenMap.get(it);
     switch (it->getOpcode()) {
     case Instruction::Br:
       // Nothing to do for PHIs and BR, since we already took care of the
       // loop control flow instructions.
-      continue;
+      return;
     case Instruction::PHI: {
       // Vectorize PHINodes.
       widenPHIInstruction(it, Entry, UF, VF, PV);
-      continue;
+      return;
     }// End of PHI.
 
     case Instruction::Add:
@@ -3611,7 +3082,7 @@
         break;
       setDebugLocFromInst(Builder, it);
 
-      Module *M = BB->getParent()->getParent();
+      Module *M = it->getParent()->getParent()->getParent();
       CallInst *CI = cast<CallInst>(it);
 
       StringRef FnName = CI->getCalledFunction()->getName();
@@ -3686,38 +3157,9 @@
       scalarizeInstruction(it);
       break;
     }// end of switch.
-  }// end of for_each instr.
 }
 
 void InnerLoopVectorizer::updateAnalysis() {
-  // Forget the original basic block.
-  SE->forgetLoop(OrigLoop);
-
-  // Update the dominator tree information.
-  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
-         "Entry does not dominate exit.");
-
-  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-    DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
-  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
-
-  // Due to if predication of stores we might create a sequence of "if(pred)
-  // a[i] = ...;  " blocks.
-  for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
-    if (i == 0)
-      DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
-    else if (isPredicatedBlock(i)) {
-      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
-    } else {
-      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
-    }
-  }
-
-  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
-  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
-  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
-  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
-
   DEBUG(DT->verifyDomTree());
 }