Index: include/llvm/Analysis/VectorUtils.h
===================================================================
--- include/llvm/Analysis/VectorUtils.h
+++ include/llvm/Analysis/VectorUtils.h
@@ -271,7 +271,7 @@
   /// Get the index for the given member. Unlike the key in the member
   /// map, the index starts from 0.
-  unsigned getIndex(InstTy *Instr) const {
+  unsigned getIndex(const InstTy *Instr) const {
     for (auto I : Members) {
       if (I.second == Instr)
         return I.first - SmallestKey;
@@ -349,7 +349,8 @@
   /// Get the interleave group that \p Instr belongs to.
   ///
   /// \returns nullptr if \p Instr does not belong to any group.
-  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
+  InterleaveGroup *
+  getInterleaveGroup(const Instruction *Instr) const {
     if (InterleaveGroupMap.count(Instr))
       return InterleaveGroupMap.find(Instr)->second;
     return nullptr;
Index: lib/Transforms/Vectorize/CMakeLists.txt
===================================================================
--- lib/Transforms/Vectorize/CMakeLists.txt
+++ lib/Transforms/Vectorize/CMakeLists.txt
@@ -7,6 +7,7 @@
   VPlan.cpp
   VPlanHCFGBuilder.cpp
   VPlanHCFGTransforms.cpp
+  VPlanSLP.cpp
   VPlanVerifier.cpp
 
   ADDITIONAL_HEADER_DIRS
Index: lib/Transforms/Vectorize/VPlan.h
===================================================================
--- lib/Transforms/Vectorize/VPlan.h
+++ lib/Transforms/Vectorize/VPlan.h
@@ -586,7 +586,11 @@
 public:
   /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
-  enum { Not = Instruction::OtherOpsEnd + 1 };
+  enum {
+    Not = Instruction::OtherOpsEnd + 1,
+    CombinedLoad,
+    CombinedStore,
+  };
 
 private:
   typedef unsigned char OpcodeTy;
@@ -609,12 +613,18 @@
     return V->getVPValueID() == VPValue::VPInstructionSC;
   }
 
+  /// Create a copy of this VPInstruction with the same opcode and operands.
+  VPInstruction *clone() const {
+    SmallVector<VPValue *, 2> Operands(operands());
+    return new VPInstruction(Opcode, Operands);
+  }
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPRecipeBase *R) {
     return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC;
   }
 
   unsigned getOpcode() const { return Opcode; }
+  void setOpcode(unsigned Op) { Opcode = Op; }
 
   /// Generate the instruction.
   /// TODO: We currently execute only per-part unless a specific instance is
@@ -626,6 +636,12 @@
 
   /// Print the VPInstruction.
   void print(raw_ostream &O) const;
+
+  Instruction *getUnderlyingInstr() {
+    return cast_or_null<Instruction>(getUnderlyingValue());
+  }
+
+  void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
 };
 
 /// VPWidenRecipe is a recipe for producing a copy of vector type for each
@@ -1363,6 +1379,76 @@
   }
 };
 
+/// Class that maps (parts of) an existing VPlan to trees of combined
+/// VPInstructions.
+class VPSlp {
+private:
+  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
+  /// sorted SmallVectors of VPValue *.
+  struct BundleDenseMapInfo {
+    static SmallVector<VPValue *, 4> getEmptyKey() {
+      return {reinterpret_cast<VPValue *>(-1)};
+    }
+
+    static SmallVector<VPValue *, 4> getTombstoneKey() {
+      return {reinterpret_cast<VPValue *>(-2)};
+    }
+
+    static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
+      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+    }
+
+    static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
+                        const SmallVector<VPValue *, 4> &RHS) {
+      return LHS == RHS;
+    }
+  };
+
+  /// Mapping of values in the original VPlan to a combined VPInstruction.
+  DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
+      BundleToCombined;
+
+  VPInterleavedAccessInfo &IAI;
+
+  /// Indicates whether we managed to combine all visited instructions or not.
+  bool CompletelySLP = true;
+
+  /// Width of the widest combined bundle in bits.
+  unsigned WidestBundleBits = 0;
+
+  using MultiNodeOpTy =
+      typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
+
+  // Input operand bundles for the current multi node. Each multi node operand
+  // bundle contains values not matching the multi node's opcode. They will
+  // be reordered in reorderMultiNodeOps, once we have finished building the
+  // multi node.
+  SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
+
+  /// Indicates whether we are currently building a multi node.
+  bool MultiNodeActive = false;
+
+  /// Check if the bundle \p Operands can be vectorized together.
+  bool areVectorizable(SmallVectorImpl<VPValue *> &Operands) const;
+
+  /// Record \p New as the combined instruction for the bundle \p Operands and
+  /// update the widest bundle width.
+  VPInstruction *addCombined(SmallVector<VPValue *, 4> &Operands,
+                             VPInstruction *New);
+
+  /// Mark the analysis as failed (not completely SLP) and return a dummy
+  /// VPInstruction.
+  VPInstruction *markFailed();
+
+  /// Reorder the operand bundles of the current multi node, trying to find
+  /// consecutive memory accesses or matching opcodes per operand position.
+  SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
+
+public:
+  VPSlp(VPInterleavedAccessInfo &IAI) : IAI(IAI) {}
+
+  /// Tries to build an SLP tree rooted at \p Operands and returns a
+  /// VPInstruction combining \p Operands, if they can be combined.
+  VPInstruction *buildGraph(SmallVector<VPValue *, 4> &Operands);
+
+  /// Return the width of the widest combined bundle in bits.
+  unsigned getWidestBundleBits() const { return WidestBundleBits; }
+
+  /// Return true if all visited instructions could be combined.
+  bool isCompletelySLP() const { return CompletelySLP; }
+};
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
Index: lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- lib/Transforms/Vectorize/VPlan.cpp
+++ lib/Transforms/Vectorize/VPlan.cpp
@@ -273,6 +273,12 @@
   case VPInstruction::Not:
     O << "not";
     break;
+  case VPInstruction::CombinedLoad:
+    O << "combined load";
+    break;
+  case VPInstruction::CombinedStore:
+    O << "combined store";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -577,6 +583,13 @@
   O << "\\l\"";
 }
 
+void VPValue::replaceAllUsesWith(VPValue *New) {
+  for (VPUser *User : users())
+    for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
+      if (User->getOperand(I) == this)
+        User->setOperand(I, New);
+}
+
 VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
                                                  InterleavedAccessInfo &IAI) {
   DenseMap<InterleaveGroup *, InterleaveGroup *>
@@ -589,7 +602,7 @@
     for (auto I = VPBB->begin(), E = VPBB->end(); I != E; I++) {
       assert(isa<VPInstruction>(&*I) && "Can only handle VPInstructions");
       VPInstruction *VPInst = cast<VPInstruction>(&*I);
-      Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+      const Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
       auto *IG = IAI.getInterleaveGroup(Inst);
       if (!IG)
         continue;
Index: lib/Transforms/Vectorize/VPlanSLP.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -0,0 +1,435 @@
+//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// This file implements SLP analysis based on VPlan. The analysis is based on
+/// the ideas described in
+///
+///   Look-ahead SLP: auto-vectorization in the presence of commutative
+///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+///   Luís F. W. Góes
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-slp"
+
+// Helper to unwrap a VPValue to the IR instruction it was created for.
+static const Instruction *getUnderlyingInstr(VPValue *V) {
+  return cast<VPInstruction>(V)->getUnderlyingInstr();
+}
+
+VPInstruction *VPSlp::markFailed() {
+  // FIXME: Currently this is used to signal we hit instructions we cannot
+  //        trivially SLP'ize.
+  CompletelySLP = false;
+  return new VPInstruction(0, {});
+}
+
+static unsigned getNumUniqueUsers(VPValue *V) {
+  SmallPtrSet<VPUser *, 4> Users(V->user_begin(), V->user_end());
+  return Users.size();
+}
+
+VPInstruction *VPSlp::addCombined(SmallVector<VPValue *, 4> &Operands,
+                                  VPInstruction *New) {
+  if (all_of(Operands, [](VPValue *V) {
+        return cast<VPInstruction>(V)->getUnderlyingInstr();
+      })) {
+    unsigned BundleSize = 0;
+    for (VPValue *V : Operands) {
+      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
+      assert(!T->isVectorTy() && "Only scalar types supported for now");
+      BundleSize += T->getScalarSizeInBits();
+    }
+    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
+  }
+
+  assert(BundleToCombined.find(Operands) == BundleToCombined.end() &&
+         "Already created a combined instruction for the operand bundle");
+  BundleToCombined[Operands] = New;
+  return New;
+}
+
+bool VPSlp::areVectorizable(SmallVectorImpl<VPValue *> &Operands) const {
+  // Currently we only support VPInstructions.
+  if (!all_of(Operands, [](VPValue *Op) {
+        return Op && isa<VPInstruction>(Op) &&
+               cast<VPInstruction>(Op)->getUnderlyingInstr();
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
+    return false;
+  }
+
+  // Check if opcodes and type width agree for all instructions in the bundle.
+  // FIXME: Differing widths/opcodes can be handled by inserting additional
+  //        instructions.
+  // FIXME: Deal with non-primitive types.
+  const Instruction *OriginalInstr = getUnderlyingInstr(Operands[0]);
+  unsigned Opcode = OriginalInstr->getOpcode();
+  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
+  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
+        const Instruction *I = getUnderlyingInstr(Op);
+        return I->getOpcode() == Opcode &&
+               I->getType()->getPrimitiveSizeInBits() == Width;
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree\n");
+    return false;
+  }
+
+  if (!all_of(Operands,
+              [](VPValue *Op) { return getNumUniqueUsers(Op) <= 1; })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
+    return false;
+  }
+
+  // For loads, check that there are no stores between them.
+  // FIXME: We only have to forbid stores that could interfere with any of
+  //        the loads in the bundle.
+  // FIXME: We do not have ordered VPBasicBlocks, so for now we just scan the
+  //        block until we find the first load in the bundle and then check
+  //        for stores until we find the last load.
+  if (Opcode == Instruction::Load) {
+    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
+    if (any_of(Operands, [Parent](VPValue *V) {
+          return cast<VPInstruction>(V)->getParent() != Parent;
+        })) {
+      LLVM_DEBUG(dbgs() << "VPSLP: not all in same BB\n");
+      return false;
+    }
+    unsigned LoadsSeen = 0;
+    for (auto &I : *Parent) {
+      auto *VPI = cast<VPInstruction>(&I);
+      if (VPI->getOpcode() == Instruction::Load &&
+          std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
+        LoadsSeen++;
+
+      if (LoadsSeen == Operands.size())
+        break;
+
+      if (LoadsSeen > 0 && VPI->getOpcode() == Instruction::Store) {
+        LLVM_DEBUG(dbgs() << "VPSLP: Store between loads\n");
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+static SmallVector<VPValue *, 4>
+getOperands(SmallVectorImpl<VPValue *> &Values, unsigned OperandIndex) {
+  SmallVector<VPValue *, 4> Operands;
+  for (VPValue *V : Values) {
+    auto *U = cast<VPUser>(V);
+    Operands.push_back(U->getOperand(OperandIndex));
+  }
+  return Operands;
+}
+
+static bool areCommutative(SmallVectorImpl<VPValue *> &Values) {
+  return Instruction::isCommutative(
+      cast<VPInstruction>(Values[0])->getOpcode());
+}
+
+static SmallVector<SmallVector<VPValue *, 4>, 4>
+getOperands(SmallVectorImpl<VPValue *> &Values) {
+  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
+  auto *VPI = cast<VPInstruction>(Values[0]);
+
+  switch (VPI->getOpcode()) {
+  case Instruction::Load:
+    llvm_unreachable("Loads terminate a tree, no need to get operands");
+  case Instruction::Store:
+    // Only the stored values are bundled; the address is added separately by
+    // the caller.
+    Result.push_back(getOperands(Values, 0));
+    break;
+  default: {
+    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
+      Result.push_back(getOperands(Values, I));
+    break;
+  }
+  }
+
+  return Result;
+}
+
+/// Returns the common opcode of \p Values, or an empty Optional if the
+/// opcodes do not all agree.
+static Optional<unsigned> getOpcode(SmallVectorImpl<VPValue *> &Values) {
+  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
+  if (any_of(Values, [Opcode](VPValue *V) {
+        return cast<VPInstruction>(V)->getOpcode() != Opcode;
+      }))
+    return {};
+  return {Opcode};
+}
+
+enum class OpMode { Failed, Load, Opcode };
+
+/// Returns true if \p A and \p B are loads or stores accessing consecutive
+/// memory, or, for any other opcode, if their opcodes match.
+static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
+                                  VPInterleavedAccessInfo &IAI) {
+  if (A->getOpcode() != B->getOpcode())
+    return false;
+
+  // The opcodes match at this point, so only check A.
+  if (A->getOpcode() != Instruction::Load &&
+      A->getOpcode() != Instruction::Store)
+    return true;
+
+  auto *GA = IAI.getInterleaveGroup(A);
+  auto *GB = IAI.getInterleaveGroup(B);
+
+  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
+}
+
+/// Implements getLAScore from Listing 7 in the paper.
+/// Traverses and compares operands of V1 and V2 to MaxLevel.
+static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
+                           VPInterleavedAccessInfo &IAI) {
+  if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
+    return 0;
+
+  if (MaxLevel == 0)
+    return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
+                                           cast<VPInstruction>(V2), IAI);
+
+  unsigned Score = 0;
+  for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
+    for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
+      Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
+                          cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
+  return Score;
+}
+
+static std::pair<OpMode, VPValue *>
+getBest(OpMode Mode, VPValue *Last, SmallVectorImpl<VPValue *> &Candidates,
+        VPInterleavedAccessInfo &IAI) {
+  LLVM_DEBUG(dbgs() << "      getBest\n");
+  VPValue *Best = Candidates[0];
+  SmallVector<VPValue *, 4> BestCandidates;
+
+  LLVM_DEBUG(dbgs() << "        Candidates for "
+                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
+  for (auto *Candidate : Candidates) {
+    auto *LastI = cast<VPInstruction>(Last);
+    auto *CandidateI = cast<VPInstruction>(Candidate);
+    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
+      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
+                        << " ");
+      BestCandidates.push_back(Candidate);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "\n");
+
+  if (BestCandidates.empty())
+    return {OpMode::Failed, nullptr};
+
+  if (BestCandidates.size() == 1)
+    return {Mode, BestCandidates[0]};
+
+  // Break ties using the look-ahead score, increasing the depth until the
+  // scores of the remaining candidates differ.
+  if (Mode == OpMode::Opcode) {
+    unsigned BestScore = 0;
+    for (unsigned Depth = 1; Depth < 5; Depth++) {
+      unsigned PrevScore = ~0u;
+      bool AllSame = true;
+
+      // FIXME: Avoid visiting the same operands multiple times.
+      for (auto *Candidate : BestCandidates) {
+        unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
+        if (PrevScore == ~0u)
+          PrevScore = Score;
+        if (PrevScore != Score)
+          AllSame = false;
+        PrevScore = Score;
+
+        if (Score > BestScore) {
+          BestScore = Score;
+          Best = Candidate;
+        }
+      }
+      if (!AllSame)
+        break;
+    }
+  }
+  LLVM_DEBUG(dbgs() << "Found best "
+                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
+                    << "\n");
+  Candidates.erase(std::remove(Candidates.begin(), Candidates.end(), Best),
+                   Candidates.end());
+
+  return {Mode, Best};
+}
+
+SmallVector<VPSlp::MultiNodeOpTy, 4> VPSlp::reorderMultiNodeOps() {
+  SmallVector<MultiNodeOpTy, 4> FinalOrder;
+
+  SmallVector<OpMode, 4> Mode;
+  LLVM_DEBUG(dbgs() << "Reordering multinode\n");
+
+  for (auto &Operands : MultiNodeOps) {
+    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
+    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
+        Instruction::Load)
+      Mode.push_back(OpMode::Load);
+    else
+      Mode.push_back(OpMode::Opcode);
+  }
+
+  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
+    LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
+    SmallVector<VPValue *, 4> Candidates;
+    LLVM_DEBUG(dbgs() << "  Candidates ");
+    for (auto Ops : MultiNodeOps) {
+      LLVM_DEBUG(
+          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
+                 << " ");
+      Candidates.push_back(Ops.second[Lane]);
+    }
+    LLVM_DEBUG(dbgs() << "\n");
+
+    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
+      LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
+      if (Mode[Op] == OpMode::Failed)
+        continue;
+
+      VPValue *Last = FinalOrder[Op].second[Lane - 1];
+      std::pair<OpMode, VPValue *> Res =
+          getBest(Mode[Op], Last, Candidates, IAI);
+      if (Res.second)
+        FinalOrder[Op].second.push_back(Res.second);
+      else
+        // TODO: handle this case
+        FinalOrder[Op].second.push_back(markFailed());
+    }
+  }
+
+  return FinalOrder;
+}
+
+static void dumpBundle(ArrayRef<VPValue *> Values) {
+  LLVM_DEBUG(dbgs() << " Ops: ");
+  for (auto Op : Values) {
+    if (Op && getUnderlyingInstr(Op))
+      LLVM_DEBUG(dbgs() << *getUnderlyingInstr(Op) << " | ");
+    else
+      LLVM_DEBUG(dbgs() << " nullptr | ");
+  }
+  LLVM_DEBUG(dbgs() << "\n");
+}
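+
+/// Tries to combine the bundle \p Values into a single VPInstruction tree.
+/// Bundles whose instructions all share an opcode are combined recursively;
+/// commutative operations open a multi node whose operand bundles are
+/// reordered via reorderMultiNodeOps before recursing. Loads and stores
+/// terminate the recursion and become CombinedLoad/CombinedStore nodes.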
+VPInstruction *VPSlp::buildGraph(SmallVector<VPValue *, 4> &Values) {
+  assert(!Values.empty() && "Need some operands!");
+
+  // If we already visited this instruction bundle, re-use the existing node.
+  auto I = BundleToCombined.find(Values);
+  if (I != BundleToCombined.end())
+    return I->second;
+
+  // Dump inputs.
+  LLVM_DEBUG(dbgs() << "buildGraph ");
+  dumpBundle(Values);
+
+  if (!areVectorizable(Values))
+    return markFailed();
+
+  assert(getOpcode(Values) && "Opcodes for all values must match");
+  unsigned ValuesOpcode = getOpcode(Values).getValue();
+
+  SmallVector<VPValue *, 4> CombinedOperands;
+  if (areCommutative(Values)) {
+    bool MultiNodeRoot = !MultiNodeActive;
+    MultiNodeActive = true;
+    for (auto &Operands : getOperands(Values)) {
+      LLVM_DEBUG(dbgs() << "  Visiting Commutative");
+      dumpBundle(Operands);
+      auto OperandsOpcode = getOpcode(Operands);
+      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
+        LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
+        CombinedOperands.push_back(buildGraph(Operands));
+      } else {
+        LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
+        // Create a dummy VPInstruction, which we will replace later with the
+        // re-ordered operand.
+        VPInstruction *Op = new VPInstruction(0, {});
+        CombinedOperands.push_back(Op);
+        MultiNodeOps.emplace_back(Op, Operands);
+      }
+    }
+
+    if (MultiNodeRoot) {
+      LLVM_DEBUG(dbgs() << "Reorder\n");
+      MultiNodeActive = false;
+
+      auto FinalOrder = reorderMultiNodeOps();
+
+      MultiNodeOps.clear();
+      for (auto &Ops : FinalOrder) {
+        VPInstruction *NewOp = buildGraph(Ops.second);
+        Ops.first->replaceAllUsesWith(NewOp);
+        for (unsigned I = 0; I < CombinedOperands.size(); I++)
+          if (CombinedOperands[I] == Ops.first)
+            CombinedOperands[I] = NewOp;
+        delete Ops.first;
+        Ops.first = NewOp;
+      }
+      LLVM_DEBUG(dbgs() << "Found final order\n");
+    }
+  } else {
+    LLVM_DEBUG(dbgs() << "  NonCommutative\n");
+    if (ValuesOpcode == Instruction::Load)
+      // Loads terminate the recursion; the combined load's operands are the
+      // addresses of the loads in the bundle.
+      for (VPValue *V : Values)
+        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
+    else
+      for (auto &Operands : getOperands(Values))
+        CombinedOperands.push_back(buildGraph(Operands));
+    if (ValuesOpcode == Instruction::Store)
+      // The combined stored values are operand 0; the address of the first
+      // store in the bundle becomes operand 1.
+      CombinedOperands.push_back(cast<VPInstruction>(Values[0])->getOperand(1));
+  }
+
+  unsigned Opcode;
+  switch (ValuesOpcode) {
+  case Instruction::Load:
+    Opcode = VPInstruction::CombinedLoad;
+    break;
+  case Instruction::Store:
+    Opcode = VPInstruction::CombinedStore;
+    break;
+  default:
+    Opcode = ValuesOpcode;
+    break;
+  }
+
+  assert(!CombinedOperands.empty() && "Need some operands");
+  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+  VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+
+  LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
+             cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
+  return addCombined(Values, VPI);
+}
Index: lib/Transforms/Vectorize/VPlanValue.h
===================================================================
--- lib/Transforms/Vectorize/VPlanValue.h
+++ lib/Transforms/Vectorize/VPlanValue.h
@@ -104,6 +104,8 @@
   const_user_range users() const {
     return const_user_range(user_begin(), user_end());
   }
+
+  void replaceAllUsesWith(VPValue *New);
 };
 
 typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
@@ -149,6 +151,8 @@
     return Operands[N];
   }
 
+  void setOperand(unsigned I, VPValue *New) { Operands[I] = New; }
+
   typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
   typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
   typedef iterator_range<operand_iterator> operand_range;
Index: unittests/Transforms/Vectorize/CMakeLists.txt
===================================================================
--- unittests/Transforms/Vectorize/CMakeLists.txt
+++ unittests/Transforms/Vectorize/CMakeLists.txt
@@ -8,4 +8,5 @@
 add_llvm_unittest(VectorizeTests
   VPlanTest.cpp
   VPlanHCFGTest.cpp
+  VPlanSlpTest.cpp
   )
Index: unittests/Transforms/Vectorize/VPlanSlpTest.cpp
===================================================================
--- /dev/null
+++ unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -0,0 +1,655 @@
+//===- llvm/unittest/Transforms/Vectorize/VPlanSlpTest.cpp ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../lib/Transforms/Vectorize/VPlan.h"
+#include "../lib/Transforms/Vectorize/VPlanHCFGBuilder.h"
+#include "../lib/Transforms/Vectorize/VPlanHCFGTransforms.h"
+#include "VPlanTestBase.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace {
+
+class VPlanSlpTest : public VPlanTestBase {
+protected:
+  TargetLibraryInfoImpl TLII;
+  TargetLibraryInfo TLI;
+  DataLayout DL;
+
+  std::unique_ptr<AssumptionCache> AC;
+  std::unique_ptr<ScalarEvolution> SE;
+  std::unique_ptr<AAResults> AARes;
+  std::unique_ptr<BasicAAResult> BasicAA;
+  std::unique_ptr<LoopAccessInfo> LAI;
+  std::unique_ptr<PredicatedScalarEvolution> PSE;
+  std::unique_ptr<InterleavedAccessInfo> IAI;
+
+  VPlanSlpTest()
+      : TLII(), TLI(TLII),
+        DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-"
+           "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:"
+           "16:32:64-S128") {}
+
+  VPInterleavedAccessInfo getInterleavedAccessInfo(Function &F, Loop *L,
+                                                   VPlan &Plan) {
+    AC.reset(new AssumptionCache(F));
+    SE.reset(new ScalarEvolution(F, TLI, *AC, *DT, *LI));
+    BasicAA.reset(new BasicAAResult(DL, F, TLI, *AC, &*DT, &*LI));
+    AARes.reset(new AAResults(TLI));
+    AARes->addAAResult(*BasicAA);
+    PSE.reset(new PredicatedScalarEvolution(*SE, *L));
+    LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, &*AARes, &*DT, &*LI));
+    IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI));
+    IAI->analyzeInterleaving();
+    return {Plan, *IAI};
+  }
+};
+
+TEST_F(VPlanSlpTest, testSlpSimple_3) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr %struct.Test, %struct.Test* %A, i64 "
+      "  %indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "  %indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "  %indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "  %indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "  %indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "  %indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPSlp Slp(VPIAI);
+
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::CombinedStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  auto *CombinedLoadB = cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadA->getOpcode());
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadB->getOpcode());
+
+  VPInstruction *GetA = cast<VPInstruction>(&*std::next(Body->begin(), 1));
+  VPInstruction *GetB = cast<VPInstruction>(&*std::next(Body->begin(), 3));
+  EXPECT_EQ(GetA, CombinedLoadA->getOperand(0));
+  EXPECT_EQ(GetB, CombinedLoadB->getOperand(0));
+}
+
+TEST_F(VPlanSlpTest, testSlpSimple_2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
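+  // Set up VPlan-level interleaved-access info; the SLP analysis queries it
+  // to decide whether two loads/stores access consecutive memory.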
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPSlp Slp(VPIAI);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::CombinedStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  auto *CombinedLoadB = cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadA->getOpcode());
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadB->getOpcode());
+}
+
+TEST_F(VPlanSlpTest, testSlpReuse_1) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vA0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vA1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPSlp Slp(VPIAI);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 8));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 10));
+
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::CombinedStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  EXPECT_EQ(CombinedLoadA, CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadA->getOpcode());
+}
+
+TEST_F(VPlanSlpTest, testSlpReuse_2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define i32 @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vA0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vA1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret i32 %vA1\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPSlp Slp(VPIAI);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 5));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 10));
+
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  Slp.buildGraph(StoreRoot);
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+static void checkReorderExample(VPInstruction *Store1, VPInstruction *Store2,
+                                VPBasicBlock *Body,
+                                VPInterleavedAccessInfo &&IAI) {
+  VPSlp Slp(IAI);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+
+  EXPECT_TRUE(Slp.isCompletelySLP());
+  EXPECT_EQ(CombinedStore->getOpcode(), VPInstruction::CombinedStore);
+
+  VPInstruction *CombinedAdd =
+      cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(CombinedAdd->getOpcode(), Instruction::Add);
+
+  VPInstruction *CombinedMulAB =
+      cast<VPInstruction>(CombinedAdd->getOperand(0));
+  VPInstruction *CombinedMulCD =
+      cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(CombinedMulAB->getOpcode(), Instruction::Mul);
+
+  VPInstruction *CombinedLoadA =
+      cast<VPInstruction>(CombinedMulAB->getOperand(0));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadA->getOpcode());
+  VPInstruction *LoadvA0 = cast<VPInstruction>(&*std::next(Body->begin(), 2));
+  VPInstruction *LoadvA1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  EXPECT_EQ(LoadvA0->getOperand(0), CombinedLoadA->getOperand(0));
+  EXPECT_EQ(LoadvA1->getOperand(0), CombinedLoadA->getOperand(1));
+
+  VPInstruction *CombinedLoadB =
+      cast<VPInstruction>(CombinedMulAB->getOperand(1));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadB->getOpcode());
+  VPInstruction *LoadvB0 = cast<VPInstruction>(&*std::next(Body->begin(), 4));
+  VPInstruction *LoadvB1 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+  EXPECT_EQ(LoadvB0->getOperand(0), CombinedLoadB->getOperand(0));
+  EXPECT_EQ(LoadvB1->getOperand(0), CombinedLoadB->getOperand(1));
+
+  EXPECT_EQ(CombinedMulCD->getOpcode(), Instruction::Mul);
+
+  VPInstruction *CombinedLoadC =
+      cast<VPInstruction>(CombinedMulCD->getOperand(0));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadC->getOpcode());
+  VPInstruction *LoadvC0 = cast<VPInstruction>(&*std::next(Body->begin(), 7));
+  VPInstruction *LoadvC1 = cast<VPInstruction>(&*std::next(Body->begin(), 17));
+  EXPECT_EQ(LoadvC0->getOperand(0), CombinedLoadC->getOperand(0));
+  EXPECT_EQ(LoadvC1->getOperand(0), CombinedLoadC->getOperand(1));
+
+  VPInstruction *CombinedLoadD =
+      cast<VPInstruction>(CombinedMulCD->getOperand(1));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadD->getOpcode());
+  VPInstruction *LoadvD0 = cast<VPInstruction>(&*std::next(Body->begin(), 9));
+  VPInstruction *LoadvD1 = cast<VPInstruction>(&*std::next(Body->begin(), 19));
+  EXPECT_EQ(LoadvD0->getOperand(0), CombinedLoadD->getOperand(0));
+  EXPECT_EQ(LoadvD1->getOperand(0), CombinedLoadD->getOperand(1));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_1) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vA1, %vB1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vC1, %vD1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_2) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vB1, %vA1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vD1, %vC1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_3) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA1, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vB1, %vA0\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vD1, %vC1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPSlp Slp(VPIAI);
+  Slp.buildGraph(StoreRoot);
+
+  // FIXME: Need to select a better first value for lane 0.
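+  // The loads of %A are swapped between the two lanes, so with lane 0 fixed
+  // the reordering cannot find consecutive accesses for every bundle.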
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_4) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vA1, %vB1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vC1, %vD1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+} // namespace
+} // namespace llvm