Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -172,40 +172,6 @@ !Ty->isPPC_FP128Ty(); } -/// \returns true if all of the instructions in \p VL are in the same block or -/// false otherwise. -static bool allSameBlock(ArrayRef VL) { - Instruction *I0 = dyn_cast(VL[0]); - if (!I0) - return false; - BasicBlock *BB = I0->getParent(); - for (int i = 1, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast(VL[i]); - if (!I) - return false; - - if (BB != I->getParent()) - return false; - } - return true; -} - -/// \returns True if all of the values in \p VL are constants. -static bool allConstant(ArrayRef VL) { - for (Value *i : VL) - if (!isa(i)) - return false; - return true; -} - -/// \returns True if all of the values in \p VL are identical. -static bool isSplat(ArrayRef VL) { - for (unsigned i = 1, e = VL.size(); i < e; ++i) - if (VL[i] != VL[0]) - return false; - return true; -} - /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -318,51 +284,156 @@ return AltOp ? AltOp->getOpcode() : 0; } - /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } - - bool isOpcodeOrAlt(Instruction *I) const { - unsigned CheckedOpcode = I->getOpcode(); - return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; - } - InstructionsState() = delete; InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} }; -} // end anonymous namespace +class InstructionOrPseudo { +private: + /// Parent of the operation. + Value *Parent; + /// The operation instance. + Value *Op; + /// Instruction represantation of the operation. 
+ Instruction *OpInst; + /// Pseudo operation after replacement. + Instruction *PseudoOp = nullptr; -/// Chooses the correct key for scheduling data. If \p Op has the same (or -/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p -/// OpValue. -static Value *isOneOf(const InstructionsState &S, Value *Op) { - auto *I = dyn_cast(Op); - if (I && S.isOpcodeOrAlt(I)) +public: + /// Main operation in a group of operations. + Instruction *MainOp = nullptr; + /// Alternative operation in a group of operations. + Instruction *AltOp = nullptr; + /// Main opcode of a group of operations. + unsigned MainOpcode = 0; + /// Alternative opcode of a group of operations. + unsigned AltOpcode = 0; + + InstructionOrPseudo(Value *Parent, Value *Op) : Parent(Parent), Op(Op) { + OpInst = dyn_cast(Op); + } + + /// Set properties of this operation after analysis of a group of operations. + void setInstructionsState(const InstructionsState &S) { + MainOp = S.MainOp; + AltOp = S.AltOp; + if (MainOp) + MainOpcode = MainOp->getOpcode(); + if (AltOp) + AltOpcode = AltOp->getOpcode(); + } + + /// Replace pseudo opration with real one. + void replaceWith(Instruction *NewOp) { + PseudoOp = OpInst; + Op = NewOp; + OpInst = NewOp; + } + + bool isPseudoEqual(Value *V) { + if (!PseudoOp) + return false; + if (PseudoOp == V) + return true; + return false; + } + + bool isPseudo() { + if (PseudoOp) + return true; + return false; + } + + bool isEqual(InstructionOrPseudo *IOP) { + return ((Op == IOP->getOp()) || isPseudoEqual(IOP->getOp()) || + IOP->isPseudoEqual(Op)); + } + + /// Returns parent of this operation + Value* getParent() { + return Parent; + } + + /// Returns a Value represntation of this operation. + Value* getOp() { return Op; - return S.OpValue; + } + + /// Returns an Instruction represntation of this operation. 
+ Instruction* getOpInst() { + return OpInst; + } + + bool isAltShuffle() const { + return (MainOpcode != 0 && AltOpcode != 0 && MainOpcode != AltOpcode); + } + + bool isOpcodeOrAlt() const { + unsigned CheckedOpcode = OpInst->getOpcode(); + return MainOpcode == CheckedOpcode || AltOpcode == CheckedOpcode; + } + + std::pair getKey() const { + assert(Parent && "Incorrect parent!"); + return std::make_pair(Parent, MainOpcode); + } +}; +} // end anonymous namespace + +/// \returns True if all of the values in \p IL are constants. +static bool allConstant(ArrayRef IL) { + return llvm::all_of( + make_range(IL.begin(), IL.end()), + [](InstructionOrPseudo *I) -> bool { return isa(I->getOp()); }); } -/// \returns analysis of the Instructions in \p VL described in +/// \returns True if all of the values in \p IL are identical. +static bool isSplat(ArrayRef IL) { + return llvm::all_of(make_range(IL.begin(), IL.end()), + [&](InstructionOrPseudo *I) -> bool { + return I->getOp() == IL[0]->getOp(); + }); +} + +/// \returns true if all of the instructions in \p IL are in the same block or +/// false otherwise. +static bool allSameBlock(ArrayRef IL) { + if (!IL[0]->getOpInst()) + return false; + BasicBlock *BB = IL[0]->getOpInst()->getParent(); + for (int i = 1, e = IL.size(); i < e; i++) { + if (!IL[i]->getOpInst()) + return false; + + if (BB != IL[i]->getOpInst()->getParent()) + return false; + } + return true; +} + +/// \returns analysis of the Instructions in \p IL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. -static InstructionsState getSameOpcode(ArrayRef VL, +static InstructionsState getSameOpcode(ArrayRef IL, unsigned BaseIndex = 0) { // Make sure these are all Instructions. 
- if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + if (llvm::any_of(IL, [](InstructionOrPseudo *IOP) { return !IOP->getOpInst(); })) + return InstructionsState(IL[BaseIndex]->getOp(), nullptr, nullptr); - bool IsCastOp = isa(VL[BaseIndex]); - bool IsBinOp = isa(VL[BaseIndex]); - unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); + Instruction* BaseOp = IL[BaseIndex]->getOpInst(); + bool IsCastOp = isa(BaseOp); + bool IsBinOp = isa(BaseOp); + unsigned Opcode = BaseOp->getOpcode(); unsigned AltOpcode = Opcode; unsigned AltIndex = BaseIndex; // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). - for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); - if (IsBinOp && isa(VL[Cnt])) { + for (int Cnt = 0, E = IL.size(); Cnt < E; Cnt++) { + auto *I = IL[Cnt]->getOpInst(); + unsigned InstOpcode = I->getOpcode(); + if (IsBinOp && isa(I)) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode) { @@ -370,9 +441,9 @@ AltIndex = Cnt; continue; } - } else if (IsCastOp && isa(VL[Cnt])) { - Type *Ty0 = cast(VL[BaseIndex])->getOperand(0)->getType(); - Type *Ty1 = cast(VL[Cnt])->getOperand(0)->getType(); + } else if (IsCastOp && isa(I)) { + Type *Ty0 = BaseOp->getOperand(0)->getType(); + Type *Ty1 = I->getOperand(0)->getType(); if (Ty0 == Ty1) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; @@ -384,22 +455,19 @@ } } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState(BaseOp, nullptr, nullptr); } - return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), - cast(VL[AltIndex])); + return InstructionsState(BaseOp, BaseOp, IL[AltIndex]->getOpInst()); } /// \returns true if all of the values in \p VL have the same type or false /// 
otherwise. static bool allSameType(ArrayRef VL) { Type *Ty = VL[0]->getType(); - for (int i = 1, e = VL.size(); i < e; i++) - if (VL[i]->getType() != Ty) - return false; - - return true; + return llvm::all_of( + make_range(VL.begin(), VL.end()), + [&](Value *V) -> bool { return V->getType() == Ty; }); } /// \returns True if Extract{Value,Element} instruction extracts element Idx. @@ -478,6 +546,7 @@ using InstrList = SmallVector; using ValueSet = SmallPtrSet; using StoreList = SmallVector; + using IOPList = SmallVector; using ExtraValueToDebugLocsMap = MapVector>; @@ -537,6 +606,7 @@ /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { + InstructionOrPseudoArray.clear(); VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -612,21 +682,21 @@ int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef Roots, unsigned Depth, int); + void buildTree_rec(Value *Parent, ArrayRef Roots, unsigned Depth, int); - /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can + /// \returns true if the ExtractElement/ExtractValue instructions in \p IL can /// be vectorized to use the original vector (or aggregate "bitcast" to a /// vector) and sets \p CurrentOrder to the identity permutation; otherwise /// returns false, setting \p CurrentOrder to either an empty vector or a /// non-identity permutation that allows to reuse extract instructions. - bool canReuseExtract(ArrayRef VL, Value *OpValue, + bool canReuseExtract(ArrayRef IL, SmallVectorImpl &CurrentOrder) const; /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef VL); + Value *vectorizeTree(Value *Parent, ArrayRef VL); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. 
@@ -635,47 +705,55 @@ /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - int getGatherCost(ArrayRef VL); + int getGatherCost(ArrayRef IL); /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef VL, - const InstructionsState &S); + void setInsertPointAfterBundle(ArrayRef IL); - /// \returns a vector from a collection of scalars in \p VL. - Value *Gather(ArrayRef VL, VectorType *Ty); + /// \returns a vector from a collection of scalars in \p IL. + Value *Gather(ArrayRef IL, VectorType *Ty); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); + Instruction *propagateMetadata(Instruction *I, + ArrayRef IL); + + void propagateIRFlags(Value *I, + ArrayRef IL, + Value *OpValue); + /// \reorder commutative operands in alt shuffle if they result in /// vectorized code. - void reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef VL, + void reorderAltShuffleOperands(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right); /// \reorder commutative operands to get better probability of /// generating vectorized code. - void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef VL, + void reorderInputsAccordingToOpcode(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right); struct TreeEntry { TreeEntry(std::vector &Container) : Container(Container) {} - /// \returns true if the scalars in VL are equal to this entry. - bool isSame(ArrayRef VL) const { - if (VL.size() == Scalars.size()) - return std::equal(VL.begin(), VL.end(), Scalars.begin()); - return VL.size() == ReuseShuffleIndices.size() && + /// \returns true if the scalars in IL are equal to this entry. 
+ bool isSame(ArrayRef IL) const { + if (IL.size() == Scalars.size()) + return std::equal(IL.begin(), IL.end(), Scalars.begin(), + [this](InstructionOrPseudo *I1, InstructionOrPseudo *I2) + { return I1->isEqual(I2); }); + return IL.size() == ReuseShuffleIndices.size() && std::equal( - VL.begin(), VL.end(), ReuseShuffleIndices.begin(), - [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; }); + IL.begin(), IL.end(), ReuseShuffleIndices.begin(), + [this](InstructionOrPseudo *I1, unsigned Idx) + { return I1->isEqual(Scalars[Idx]); }); } /// A vector of scalars. - ValueList Scalars; + IOPList Scalars; /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; @@ -703,24 +781,31 @@ }; /// Create a new VectorizableTree entry. - void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + void newTreeEntry(ArrayRef IL, bool Vectorized, + int &UserTreeIdx, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; - Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); + Last->Scalars.insert(Last->Scalars.begin(), IL.begin(), IL.end()); Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; if (Vectorized) { - for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + for (InstructionOrPseudo *IOP : IL) { + auto *I = IOP->getOpInst(); + assert( + (ScalarToTreeEntry.find(I) == ScalarToTreeEntry.end() || + ScalarToTreeEntry[I].find(IOP->getKey()) == + ScalarToTreeEntry[I].end()) && + "Scalar already in tree!"); + ScalarToTreeEntry[I][IOP->getKey()] = idx; } } else { - MustGather.insert(VL.begin(), VL.end()); + for (InstructionOrPseudo *IOP : IL) + 
MustGather.insert(IOP->getOp()); } if (UserTreeIdx >= 0) @@ -733,14 +818,33 @@ std::vector VectorizableTree; TreeEntry *getTreeEntry(Value *V) { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + auto *I = dyn_cast(V); + if (!I) + return nullptr; + auto It = ScalarToTreeEntry.find(I); + if (It != ScalarToTreeEntry.end()) { + auto &STT = It->second; + for (auto STTI : STT) + for (auto *IOP : VectorizableTree[STTI.second].Scalars) + if (IOP->getOpInst() == I) + return &VectorizableTree[STTI.second]; + } return nullptr; } + std::vector> InstructionOrPseudoArray; + + /// Allocates InstructionOrPseudo. + InstructionOrPseudo *allocInstructionOrPseudo(Value* Parent, Value *V) { + InstructionOrPseudoArray.emplace_back( + llvm::make_unique(Parent, V)); + int idx = InstructionOrPseudoArray.size() - 1; + return (InstructionOrPseudoArray[idx]).get(); + } + /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; + SmallDenseMap, int>> + ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -832,15 +936,34 @@ ScheduleData() = default; - void init(int BlockSchedulingRegionID, Value *OpVal) { + void init(int BlockSchedulingRegionID) { FirstInBundle = this; NextInBundle = nullptr; NextLoadStore = nullptr; IsScheduled = false; + Parent = nullptr; + Opcode = 0; + IsPseudo = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + } + + // Initialize ScheduleData for pseudo instruction + // with ScheduleData for an original instruction OpOrigin. 
+ void init(int BlockSchedulingRegionID, ScheduleData *OpOrigin, + InstructionOrPseudo *IOP) { + FirstInBundle = this; + NextInBundle = nullptr; + IsScheduled = false; + Parent = IOP->getParent(); + Opcode = IOP->MainOpcode; + IsPseudo = true; + Inst = OpOrigin->Inst; + NextLoadStore = OpOrigin->NextLoadStore; SchedulingRegionID = BlockSchedulingRegionID; UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); - OpValue = OpVal; } /// Returns true if the dependency information has been calculated. @@ -945,8 +1068,14 @@ /// dry-run). bool IsScheduled = false; - /// Opcode of the current instruction in the schedule data. - Value *OpValue = nullptr; + /// Opcode that represents instructions to be vectorized. + unsigned Opcode = 0; + + /// Parent of this operation. + Value *Parent = nullptr; + + /// True if this is a pseudo operation. + bool IsPseudo = false; }; #ifndef NDEBUG @@ -984,22 +1113,17 @@ ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + ScheduleData *getScheduleData(InstructionOrPseudo *IOP) { + std::pair Key = IOP->getKey(); + auto *I = IOP->getOpInst(); + ScheduleData *SD = ScheduleDataMap[I][std::make_pair(nullptr, 0)]; + if (SD && SD->Parent == Key.first && SD->Opcode == Key.second && + SD->SchedulingRegionID == SchedulingRegionID) + return SD; + SD = ScheduleDataMap[I][Key]; + if (SD && SD->Parent == Key.first && SD->Opcode == Key.second && + SD->SchedulingRegionID == SchedulingRegionID) return SD; - return nullptr; - } - - ScheduleData *getScheduleData(Value *V, Value *Key) { - if (V == Key) - return getScheduleData(V); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) - return SD; - } return nullptr; } @@ -1015,18 +1139,18 @@ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData 
*BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; - continue; - } + assert(BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); // Handle the def-use chain dependencies. for (Use &U : BundleMember->Inst->operands()) { auto *I = dyn_cast(U.get()); if (!I) continue; doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { - if (OpDef && OpDef->hasValidDependencies() && + if (OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after // decrementing, so we can put the dependent instruction @@ -1057,15 +1181,15 @@ } } - void doForAllOpcodes(Value *V, + void doForAllOpcodes(Instruction *I, function_ref Action) { - if (ScheduleData *SD = getScheduleData(V)) - Action(SD); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) - for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) - Action(P.second); + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + return; + for (auto It : ScheduleDataMap[I]) { + auto *SD = It.second; + if (SD && isInSchedulingRegion(SD)) + Action(SD); + } } /// Put all instructions into the ReadyList which are ready for scheduling. @@ -1085,18 +1209,17 @@ /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, - const InstructionsState &S); + bool tryScheduleBundle(ArrayRef IL, BoUpSLP *SLP); /// Un-bundles a group of instructions. - void cancelScheduling(ArrayRef VL, Value *OpValue); + void cancelScheduling(InstructionOrPseudo *IOP); /// Allocates schedule data chunk. 
ScheduleData *allocateScheduleDataChunks(); /// Extends the scheduling region so that V is inside the region. /// \returns true if the region size is within the limit. - bool extendSchedulingRegion(Value *V, const InstructionsState &S); + bool extendSchedulingRegion(InstructionOrPseudo *IOP); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -1126,12 +1249,11 @@ /// Attaches ScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. - /// ScheduleData structures are recycled. - DenseMap ScheduleDataMap; - - /// Attaches ScheduleData to Instruction with the leading key. - DenseMap> - ExtraScheduleDataMap; + /// ScheduleData structures are recycled. All real instructions are placed + /// in [Instruction *][nullptr][0]. + DenseMap, ScheduleData *>> + ScheduleDataMap; struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } @@ -1170,6 +1292,9 @@ /// Attaches the BlockScheduling structures to basic blocks. MapVector> BlocksSchedules; + /// Remove or subsititude any pseudo instructions in the tree. + void eliminatePseudos(BlockScheduling *BS); + /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. 
void scheduleBlock(BlockScheduling *BS); @@ -1293,14 +1418,16 @@ std::string Str; raw_string_ostream OS(Str); if (isSplat(Entry->Scalars)) { - OS << " " << *Entry->Scalars[0]; + OS << " " << *Entry->Scalars[0]->getOp(); return Str; } - for (auto V : Entry->Scalars) { + for (auto *IOP : Entry->Scalars) { + Value *V = IOP->getOp(); OS << *V; - if (std::any_of( - R->ExternalUses.begin(), R->ExternalUses.end(), - [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) + if (std::any_of(R->ExternalUses.begin(), R->ExternalUses.end(), + [&](const BoUpSLP::ExternalUser &EU) { + return EU.Scalar == V; + })) OS << " "; OS << "\n"; } @@ -1330,7 +1457,7 @@ UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots[0], Roots, 0, -1); // Collect the values that we need to extract from the tree. for (TreeEntry &EIdx : VectorizableTree) { @@ -1342,7 +1469,7 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { - Value *Scalar = Entry->Scalars[Lane]; + auto *Scalar = Entry->Scalars[Lane]; int FoundLane = Lane; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = @@ -1351,13 +1478,14 @@ } // Check if the scalar is externally used as an extra arg. 
- auto ExtI = ExternallyUsedValues.find(Scalar); + Value *V = Scalar->getOp(); + auto ExtI = ExternallyUsedValues.find(V); if (ExtI != ExternallyUsedValues.end()) { LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " - << Lane << " from " << *Scalar << ".\n"); - ExternalUses.emplace_back(Scalar, nullptr, FoundLane); + << Lane << " from " << *V << ".\n"); + ExternalUses.emplace_back(V, nullptr, FoundLane); } - for (User *U : Scalar->users()) { + for (User *U : V->users()) { LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); Instruction *UserInst = dyn_cast(U); @@ -1365,13 +1493,13 @@ continue; // Skip in-tree scalars that become vectors - if (TreeEntry *UseEntry = getTreeEntry(U)) { - Value *UseScalar = UseEntry->Scalars[0]; + if (TreeEntry *UseEntry = getTreeEntry(UserInst)) { + Value *UseScalar = UseEntry->Scalars[0]->getOp(); // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. if (UseScalar != U || - !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { + !InTreeUserNeedToExtract(V, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(!UseEntry->NeedToGather && "Bad state"); @@ -1384,42 +1512,48 @@ continue; LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " - << Lane << " from " << *Scalar << ".\n"); - ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); + << Lane << " from " << *V << ".\n"); + ExternalUses.push_back(ExternalUser(V, U, FoundLane)); } } } } -void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, +void BoUpSLP::buildTree_rec(Value *Parent, ArrayRef VL, unsigned Depth, int UserTreeIdx) { - assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); + SmallVector IL; + for (Value *V: VL) + IL.push_back(allocInstructionOrPseudo(Parent, V)); + InstructionsState S = getSameOpcode(IL); + for (InstructionOrPseudo *IOP : IL) + IOP->setInstructionsState(S); + + 
assert((allConstant(IL) || allSameType(VL)) && "Invalid types!"); - InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { + if (allConstant(IL) || isSplat(IL) || !allSameBlock(IL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } @@ -1427,40 +1561,39 @@ // the same block. // Don't vectorize ephemeral values. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (auto *IOP : IL) { + if (EphValues.count(IOP->getOp())) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *IOP->getOp() << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } // Check if this is a duplicate of another entry. 
- if (TreeEntry *E = getTreeEntry(S.OpValue)) { - LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); - if (!E->isSame(VL)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); - return; - } + TreeEntry *E = getTreeEntry(IL[0]->MainOp); + if (E && E->isSame(IL)) { + LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " + << *IL[0]->MainOp << ".\n"); // Record the reuse of the tree node. FIXME, currently this is only used to // properly draw the graph rather than for the actual vectorization. E->UserTreeIndices.push_back(UserTreeIdx); - LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *IL[0]->MainOp << ".\n"); return; } // Check that none of the instructions in the bundle are already in the tree. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast(VL[i]); - if (!I) - continue; - if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (auto *IOP : IL) { + auto *I = IOP->getOpInst(); + // Avoid any non-binary duplicate operation. + if ((getTreeEntry(I) && (!isa(I) || IL[0]->MainOp == I)) || + (ScalarToTreeEntry.find(I) != ScalarToTreeEntry.end() && + ScalarToTreeEntry[I].find(IOP->getKey()) != + ScalarToTreeEntry[I].end())) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *I << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } @@ -1468,47 +1601,52 @@ // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. 
- for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (auto *IOP : IL) { + if (MustGather.count(IOP->getOp()) || + is_contained(UserIgnoreList, IOP->getOp())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } // Check that all of the users of the scalars that we want to vectorize are // schedulable. - auto *VL0 = cast(S.OpValue); + auto *VL0 = IL[0]->MainOp; BasicBlock *BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // Check that every instruction appears once in this bundle. SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; + SmallVector UniqueValues; DenseMap UniquePositions; - for (Value *V : VL) { + for (auto *I : IL) { + Value *V = I->getOp(); auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) - UniqueValues.emplace_back(V); + if (Res.second) { + InstructionOrPseudo *IOP = allocInstructionOrPseudo(Parent, V); + IOP->setInstructionsState(S); + UniqueValues.emplace_back(IOP); + } } - if (UniqueValues.size() == VL.size()) { + if (UniqueValues.size() == IL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } - VL = UniqueValues; + IL = UniqueValues; } auto &BSRef = BlocksSchedules[BB]; @@ -1517,59 +1655,59 @@ 
BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + if (!BS.tryScheduleBundle(IL, this)) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL0) || - !BS.getScheduleData(VL0)->isPartOfBundle()) && + assert((!BS.getScheduleData(IL[0]) || + !BS.getScheduleData(IL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = IL[0]->isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : IL[0]->MainOpcode; switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); // Check for terminator values (e.g. invoke). - for (unsigned j = 0; j < VL.size(); ++j) + for (unsigned j = 0; j < IL.size(); ++j) for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { Instruction *Term = dyn_cast( - cast(VL[j])->getIncomingValueForBlock( + cast(IL[j]->getOp())->getIncomingValueForBlock( PH->getIncomingBlock(i))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) - Operands.push_back(cast(j)->getIncomingValueForBlock( - PH->getIncomingBlock(i))); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (auto *j : IL) + Operands.push_back(cast( + j->getOp())->getIncomingValueForBlock( + PH->getIncomingBlock(i))); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ExtractValue: case Instruction::ExtractElement: { OrdersType CurrentOrder; - bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + bool Reuse = canReuseExtract(IL, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -1586,13 +1724,13 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); - BS.cancelScheduling(VL, VL0); + newTreeEntry(IL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); return; } case Instruction::Load: { @@ -1606,21 +1744,21 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. 
- SmallVector PointerOps(VL.size()); + SmallVector PointerOps(IL.size()); auto POIter = PointerOps.begin(); - for (Value *V : VL) { - auto *L = cast(V); + for (auto *I : IL) { + auto *L = cast(I->getOpInst()); if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1646,18 +1784,18 @@ dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt().getZExtValue() == (IL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. 
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1666,8 +1804,8 @@ } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1683,26 +1821,26 @@ case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast(VL[i])->getOperand(0)->getType(); + for (auto *IOP : IL) { + Type *Ty = IOP->getOpInst()->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *I : IL) + Operands.push_back(I->getOpInst()->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1711,28 +1849,28 @@ // Check that all of the compares have the same predicate. 
CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); + for (auto *IOP : IL) { + CmpInst *Cmp = cast(IOP->getOpInst()); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *I : IL) + Operands.push_back(I->getOpInst()->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1755,36 +1893,36 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. 
if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderInputsAccordingToOpcode(IL, Left, Right); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *Inst: IL) + Operands.push_back(Inst->getOpInst()->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast(VL[j])->getNumOperands() != 2) { + for (auto *IOP : IL) { + if (IOP->getOpInst()->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1792,59 +1930,60 @@ // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast(VL[j])->getOperand(0)->getType(); + for (auto *IOP : IL) { + Type *CurTy = IOP->getOpInst()->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. 
- for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast(VL[j])->getOperand(1); + for (auto *IOP : IL) { + auto Op = IOP->getOpInst()->getOperand(1); if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *j : IL) + Operands.push_back(j->getOpInst()->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::Store: { // Check if the stores are consecutive or of we need to swizzle them. 
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + for (unsigned i = 0, e = IL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(IL[i]->getOp(), IL[i + 1]->getOp(), + *DL, *SE)) { + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(0)); + for (auto *j : IL) + Operands.push_back(j->getOpInst()->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { @@ -1854,8 +1993,8 @@ // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1863,15 +2002,15 @@ Value *A1I = nullptr; if (hasVectorInstrinsicScalarOpd(ID, 1)) A1I = CI->getArgOperand(1); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast(VL[i]); + for (unsigned i = 1, e = IL.size(); i != e; ++i) { + CallInst *CI2 = dyn_cast(IL[i]->getOpInst()); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] - << 
"\n"); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << + *IL[i]->getOp() << "\n"); return; } // ctlz,cttz and powi are special intrinsics whose second argument @@ -1879,8 +2018,8 @@ if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1891,60 +2030,60 @@ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *IL[i]->getOp() << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); + for (auto *VecOp : IL) { + CallInst *CI2 = dyn_cast(VecOp->getOpInst()); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ShuffleVector: // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. 
- if (!S.isAltShuffle()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + if (!IL[0]->isAltShuffle()) { + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderAltShuffleOperands(IL, Left, Right); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *VecOp : IL) + Operands.push_back(VecOp->getOpInst()->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; default: - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1975,12 +2114,12 @@ return N; } -bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, +bool BoUpSLP::canReuseExtract(ArrayRef IL, SmallVectorImpl &CurrentOrder) const { - Instruction *E0 = cast(OpValue); + Instruction *E0 = IL[0]->MainOp; assert(E0->getOpcode() == Instruction::ExtractElement || E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + assert(E0->getOpcode() == getSameOpcode(IL).getOpcode() && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -1996,18 +2135,18 @@ return false; // Check if load can be rewritten as load of vector. LoadInst *LI = dyn_cast(Vec); - if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) + if (!LI || !LI->isSimple() || !LI->hasNUses(IL.size())) return false; } else { NElts = Vec->getType()->getVectorNumElements(); } - if (NElts != VL.size()) + if (NElts != IL.size()) return false; // Check that all of the indices extract from the correct offset. bool ShouldKeepOrder = true; - unsigned E = VL.size(); + unsigned E = IL.size(); // Assign to all items the initial value E + 1 so we can check if the extract // instruction index was used already. 
// Also, later we can check that all the indices are used and we have a @@ -2016,7 +2155,7 @@ CurrentOrder.assign(E, E + 1); unsigned I = 0; for (; I < E; ++I) { - auto *Inst = cast(VL[I]); + auto *Inst = IL[I]->getOpInst(); if (Inst->getOperand(0) != Vec) break; Optional Idx = getExtractIndex(Inst); @@ -2027,6 +2166,6 @@ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) break; ShouldKeepOrder = false; CurrentOrder[ExtIdx] = I; } else { if (CurrentOrder[I] != E + 1) @@ -2045,25 +2185,26 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { return I->hasOneUse() || std::all_of(I->user_begin(), I->user_end(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + return ScalarToTreeEntry.count(dyn_cast(U)) > 0; }); } int BoUpSLP::getEntryCost(TreeEntry *E) { - ArrayRef VL = E->Scalars; + ArrayRef IL = E->Scalars; + Value *V = E->Scalars[0]->getOp(); - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) + Type *ScalarTy = V->getType(); + if (StoreInst *SI = dyn_cast(V)) ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) + else if (CmpInst *CI = dyn_cast(V)) ScalarTy = CI->getOperand(0)->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. 
- if (MinBWs.count(VL[0])) + if (MinBWs.count(V)) VecTy = VectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + IntegerType::get(F->getContext(), MinBWs[V].first), IL.size()); unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -2073,26 +2214,29 @@ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } if (E->NeedToGather) { - if (allConstant(VL)) + if (allConstant(IL)) return 0; - if (isSplat(VL)) { + if (isSplat(IL)) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && - allSameType(VL) && allSameBlock(VL)) { + if (IL[0]->MainOpcode == Instruction::ExtractElement && allSameBlock(IL)) { + ValueList VL; + for (auto *IOP : IL) + VL.push_back(IOP->getOp()); Optional ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { + for (auto *IOP : IL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. 
- if (areAllUsersVectorized(cast(V)) && - !ScalarToTreeEntry.count(V)) { + auto *I = IOP->getOpInst(); + if (areAllUsersVectorized(I) && + !ScalarToTreeEntry.count(I)) { auto *IO = cast( - cast(V)->getIndexOperand()); + cast(I)->getIndexOperand()); Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, IO->getZExtValue()); } @@ -2100,13 +2244,12 @@ return ReuseShuffleCost + Cost; } } - return ReuseShuffleCost + getGatherCost(VL); + return ReuseShuffleCost + getGatherCost(IL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + auto *VL0 = E->Scalars[0]->MainOp; + assert(IL[0]->MainOpcode != 0 && allSameBlock(IL) && "Invalid IL"); + unsigned ShuffleOrOp = IL[0]->isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : IL[0]->MainOpcode; switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2116,9 +2259,10 @@ if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { + auto *Inst = IL[I]->getOpInst(); if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( - cast(VL[I])->getIndexOperand()); + cast(Inst)->getIndexOperand()); Idx = IO->getZExtValue(); ReuseShuffleCost -= TTI->getVectorInstrCost( Instruction::ExtractElement, VecTy, Idx); @@ -2129,10 +2273,10 @@ } } Idx = ReuseShuffleNumbers; - for (Value *V : VL) { + for (auto *IOP : IL) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( - cast(V)->getIndexOperand()); + cast(IOP->getOpInst())->getIndexOperand()); Idx = IO->getZExtValue(); } else { --Idx; @@ -2148,11 +2292,11 @@ DeadCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *E = cast(VL[i]); + for (unsigned i = 0, e = IL.size(); i < e; ++i) { // If all users are going to be 
vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. + auto *E = IL[i]->getOpInst(); if (areAllUsersVectorized(E)) { // Take credit for instruction that will become dead. if (E->hasOneUse()) { @@ -2166,7 +2310,8 @@ Ext->getOpcode(), Ext->getType(), VecTy, i); // Add back the cost of s|zext which is subtracted separately. DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), E->getType(), Ext); + Ext->getOpcode(), Ext->getType(), + E->getType(), Ext); continue; } } @@ -2176,7 +2321,7 @@ } return DeadCost; } - return ReuseShuffleCost + getGatherCost(VL); + return ReuseShuffleCost + getGatherCost(IL); case Instruction::ZExt: case Instruction::SExt: @@ -2192,20 +2337,21 @@ case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(IL[0]->MainOpcode, ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. - int ScalarCost = VL.size() * ScalarEltCost; + int ScalarCost = IL.size() * ScalarEltCost; - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + VectorType *SrcVecTy = VectorType::get(SrcTy, IL.size()); int VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(IL[0]->MainOpcode, VecTy, + SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2213,14 +2359,17 @@ case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. 
- int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, - Builder.getInt1Ty(), VL0); + int ScalarEltCost = TTI->getCmpSelInstrCost(IL[0]->MainOpcode, + ScalarTy, + Builder.getInt1Ty(), + VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), IL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(IL[0]->MainOpcode, VecTy, + MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: @@ -2257,9 +2406,10 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. 
- ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast(VL[i]); + ConstantInt *CInt0 = dyn_cast( + IL[0]->getOpInst()->getOperand(1)); + for (auto *IOP : IL) { + auto *I = IOP->getOpInst(); ConstantInt *CInt = dyn_cast(I->getOperand(1)); if (!CInt) { Op2VK = TargetTransformInfo::OK_AnyValue; @@ -2269,23 +2419,20 @@ if (Op2VP == TargetTransformInfo::OP_PowerOf2 && !CInt->getValue().isPowerOf2()) Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; - } if (CInt0 != CInt) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + IL[0]->MainOpcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost(IL[0]->MainOpcode, VecTy, + Op1VK, Op2VK, Op1VP, Op2VP, + Operands); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2297,7 +2444,7 @@ int ScalarEltCost = TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = @@ -2310,7 +2457,7 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= 
(ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; int VecLdCost = @@ -2328,7 +2475,7 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; int VecStCost = @@ -2351,7 +2498,7 @@ int ScalarEltCost = TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; @@ -2366,44 +2513,42 @@ return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(IL[0]->isAltShuffle() && + ((Instruction::isBinaryOp(IL[0]->MainOpcode) && + Instruction::isBinaryOp(IL[0]->AltOpcode)) || + (Instruction::isCast(IL[0]->MainOpcode) && + Instruction::isCast(IL[0]->AltOpcode))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); ReuseShuffleCost -= TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + IL[Idx]->getOpInst(), TargetTransformInfo::TCK_RecipThroughput); } - for (Value *V : VL) { - Instruction *I = cast(V); + for (auto *IOP : IL) { ReuseShuffleCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + IOP->getOpInst(), TargetTransformInfo::TCK_RecipThroughput); } } - for (Value *i : VL) { - Instruction *I = cast(i); - 
assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + for (auto *IOP : IL) { + assert(IOP->isOpcodeOrAlt() && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + IOP->getOpInst(), TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(IL[0]->MainOpcode)) { + VecCost = TTI->getArithmeticInstrCost(IL[0]->MainOpcode, VecTy); + VecCost += TTI->getArithmeticInstrCost(IL[0]->AltOpcode, VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); - VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); - VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + Type *Src0SclTy = IL[0]->MainOp->getOperand(0)->getType(); + Type *Src1SclTy = IL[0]->AltOp->getOperand(0)->getType(); + VectorType *Src0Ty = VectorType::get(Src0SclTy, IL.size()); + VectorType *Src1Ty = VectorType::get(Src1SclTy, IL.size()); + VecCost = TTI->getCastInstrCost(IL[0]->MainOpcode, VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(IL[0]->AltOpcode, VecTy, + Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -2437,6 +2582,23 @@ return true; } +Instruction *BoUpSLP::propagateMetadata(Instruction *I, + ArrayRef IL) { + ValueList VL; + for (auto *IOP: IL) + VL.push_back(IOP->getOp()); + return llvm::propagateMetadata(I, VL); +} + +void BoUpSLP::propagateIRFlags(Value *I, + ArrayRef IL, + Value *OpValue) { + ValueList VL; + 
for (auto *IOP: IL) + VL.push_back(IOP->getOp()); + llvm::propagateIRFlags(I, VL, OpValue); +} + bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. @@ -2469,7 +2631,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = N.Scalars[0]->getOpInst(); if (!Inst) continue; @@ -2481,8 +2643,9 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) - LiveValues.insert(cast(&*J)); + auto *I = dyn_cast(&*J); + if (I && getTreeEntry(I)) + LiveValues.insert(I); } LLVM_DEBUG({ @@ -2553,8 +2716,8 @@ int C = getEntryCost(&TE); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for bundle that starts with " << *TE.Scalars[0] - << ".\n"); + << " for bundle that starts with " + << *TE.Scalars[0]->getOp() << ".\n"); Cost += C; } @@ -2575,7 +2738,7 @@ // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].Scalars[0]->getOp(); if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = @@ -2618,21 +2781,22 @@ return Cost; } -int BoUpSLP::getGatherCost(ArrayRef VL) { - // Find the type of the operands in VL. - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) +int BoUpSLP::getGatherCost(ArrayRef IL) { + // Find the type of the operands in IL. 
+ Value *V = IL[0]->getOp(); + Type *ScalarTy = V->getType(); + if (StoreInst *SI = dyn_cast(V)) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. DenseSet ShuffledElements; DenseSet UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. - for (unsigned I = VL.size(); I > 0; --I) { + for (unsigned I = IL.size(); I > 0; --I) { unsigned Idx = I - 1; - if (!UniqueElements.insert(VL[Idx]).second) + if (!UniqueElements.insert(IL[Idx]->getOp()).second) ShuffledElements.insert(Idx); } return getGatherCost(VecTy, ShuffledElements); @@ -2647,25 +2811,24 @@ // load a[3] + load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize this // code. -void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef VL, +void BoUpSLP::reorderAltShuffleOperands(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right) { // Push left and right operands of binary operation into Left and Right - for (Value *V : VL) { - auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); + for (auto *Inst : IL) { + auto *I = Inst->getOpInst(); + assert(Inst->isOpcodeOrAlt() && "Incorrect instruction in vector"); Left.push_back(I->getOperand(0)); Right.push_back(I->getOperand(1)); } // Reorder if we have a commutative operation and consecutive access // are on either side of the alternate instructions. 
- for (unsigned j = 0; j < VL.size() - 1; ++j) { + for (unsigned j = 0; j < IL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { - Instruction *VL1 = cast(VL[j]); - Instruction *VL2 = cast(VL[j + 1]); + Instruction *VL1 = IL[j]->getOpInst(); + Instruction *VL2 = IL[j + 1]->getOpInst(); if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; @@ -2679,8 +2842,8 @@ } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { - Instruction *VL1 = cast(VL[j]); - Instruction *VL2 = cast(VL[j + 1]); + Instruction *VL1 = IL[j]->getOpInst(); + Instruction *VL2 = IL[j + 1]->getOpInst(); if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; @@ -2762,14 +2925,13 @@ return false; } -void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, - ArrayRef VL, +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right) { - if (!VL.empty()) { + if (!IL.empty()) { // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. - auto *I = cast(VL[0]); + auto *I = IL[0]->getOpInst(); Value *VLeft = I->getOperand(0); Value *VRight = I->getOperand(1); if (!isa(VRight) && isa(VLeft)) @@ -2778,6 +2940,7 @@ Left.push_back(VLeft); Right.push_back(VRight); } + unsigned Opcode = IL[0]->MainOpcode; // Keep track if we have instructions with all the same opcode on one side. 
bool AllSameOpcodeLeft = isa(Left[0]); @@ -2786,8 +2949,8 @@ bool SplatLeft = true; bool SplatRight = true; - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - Instruction *I = cast(VL[i]); + for (unsigned i = 1, e = IL.size(); i != e; ++i) { + Instruction *I = IL[i]->getOpInst(); assert(((I->getOpcode() == Opcode && I->isCommutative()) || (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && "Can only process commutative instruction"); @@ -2834,7 +2997,7 @@ // add a[1],c[2] load b[1] // b[2] load b[2] // add a[3],c[3] load b[3] - for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { + for (unsigned j = 0, e = IL.size() - 1; j < e; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { @@ -2855,31 +3018,27 @@ } } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, - const InstructionsState &S) { +void BoUpSLP::setInsertPointAfterBundle(ArrayRef IL) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. - auto *Front = cast(S.OpValue); + auto *Front = IL[0]->MainOp; auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - auto *I = cast(V); - return !S.isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of( + make_range(IL.begin(), IL.end()), [=](InstructionOrPseudo *I) -> bool { + return !I->isOpcodeOrAlt() || I->getOpInst()->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; - // Find the last instruction. The common case should be that BB has been - // scheduled, and the last instruction is VL.back(). So we start with - // VL.back() and iterate over schedule data until we reach the end of the - // bundle. The end of the bundle is marked by null ScheduleData. + // Find the last instruction. 
If the bundle is not scheduled then + // the first in the bundle is the last one in BB, because we discover + // bundles in backward walk. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlockScheduling *BS = BlocksSchedules[BB].get(); + auto *Bundle = BS->getScheduleData(IL[0]); if (Bundle && Bundle->isPartOfBundle()) - for (; Bundle; Bundle = Bundle->NextInBundle) - if (Bundle->OpValue == Bundle->Inst) - LastInst = Bundle->Inst; + LastInst = Bundle->FirstInBundle->Inst; } // LastInst can still be null at this point if there's either not an entry @@ -2901,9 +3060,11 @@ // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet Bundle(VL.begin(), VL.end()); + SmallPtrSet Bundle; + for (auto *IOP : IL) + Bundle.insert(IOP->getOp()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) + if (Bundle.erase(&I)) LastInst = &I; if (Bundle.empty()) break; @@ -2916,22 +3077,23 @@ Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { +Value *BoUpSLP::Gather(ArrayRef IL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { - Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); + auto *Op = IL[i]->getOp(); + Vec = Builder.CreateInsertElement(Vec, Op, Builder.getInt32(i)); if (Instruction *Insrt = dyn_cast(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. - if (TreeEntry *E = getTreeEntry(VL[i])) { + if (TreeEntry *E = getTreeEntry(Op)) { // Find which lane we need to extract. 
int FoundLane = -1; for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) { // Is this the lane of the scalar that we are looking for ? - if (E->Scalars[Lane] == VL[i]) { + if (E->Scalars[Lane]->isEqual(IL[i])) { FoundLane = Lane; break; } @@ -2942,7 +3104,7 @@ std::distance(E->ReuseShuffleIndices.begin(), llvm::find(E->ReuseShuffleIndices, FoundLane)); } - ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); + ExternalUses.push_back(ExternalUser(Op, Insrt, FoundLane)); } } } @@ -2950,29 +3112,33 @@ return Vec; } -Value *BoUpSLP::vectorizeTree(ArrayRef VL) { - InstructionsState S = getSameOpcode(VL); - if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - if (E->isSame(VL)) { - Value *V = vectorizeTree(E); - if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { - // We need to get the vectorized value but without shuffle. - if (auto *SV = dyn_cast(V)) { - V = SV->getOperand(0); - } else { - // Reshuffle to get only unique values. - SmallVector UniqueIdxs; - SmallSet UsedIdxs; - for(unsigned Idx : E->ReuseShuffleIndices) - if (UsedIdxs.insert(Idx).second) +Value *BoUpSLP::vectorizeTree(Value *Parent, ArrayRef VL) { + IOPList IL; + for (Value *V: VL) + IL.push_back(allocInstructionOrPseudo(Parent, V)); + InstructionsState S = getSameOpcode(IL); + for (InstructionOrPseudo *IOP: IL) + IOP->setInstructionsState(S); + if (IL[0]->MainOp) { + TreeEntry *E = getTreeEntry(IL[0]->MainOp); + if (E && E->isSame(IL)) { + Value *V = vectorizeTree(E); + if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { + // We need to get the vectorized value but without shuffle. + if (auto *SV = dyn_cast(V)) { + V = SV->getOperand(0); + } else { + // Reshuffle to get only unique values. 
+ SmallVector UniqueIdxs; + SmallSet UsedIdxs; + for (unsigned Idx : E->ReuseShuffleIndices) + if (UsedIdxs.insert(Idx).second) UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), - UniqueIdxs); - } + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), + UniqueIdxs); } - return V; } + return V; } } @@ -2982,26 +3148,27 @@ // Check that every instruction appears once in this bundle. SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; - if (VL.size() > 2) { + IOPList UniqueValues; + if (IL.size() > 2) { DenseMap UniquePositions; - for (Value *V : VL) { + for (auto *I : IL) { + Value *V = I->getOp(); auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second || isa(V)) - UniqueValues.emplace_back(V); + UniqueValues.emplace_back(I); } // Do not shuffle single element or if number of unique values is not power // of 2. - if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 || + if (UniqueValues.size() == IL.size() || UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) ReuseShuffleIndicies.clear(); else - VL = UniqueValues; + IL = UniqueValues; } - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); - Value *V = Gather(VL, VecTy); + Value *V = Gather(IL, VecTy); if (!ReuseShuffleIndicies.empty()) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), ReuseShuffleIndicies, "shuffle"); @@ -3026,12 +3193,12 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " + << *E->Scalars[0]->MainOp << ".\n"); return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(S.OpValue); + auto *VL0 = E->Scalars[0]->MainOp; Type *ScalarTy = VL0->getType(); 
if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3040,7 +3207,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3054,8 +3221,8 @@ return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = E->Scalars[0]->isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : E->Scalars[0]->MainOpcode; switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3083,12 +3250,13 @@ } // Prepare the operand vector. - for (Value *V : E->Scalars) - Operands.push_back(cast(V)->getIncomingValueForBlock(IBB)); + for (auto *IOP : E->Scalars) + Operands.push_back( + cast(IOP->getOp())->getIncomingValueForBlock(IBB)); Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(VL0, Operands); NewPhi->addIncoming(Vec, IBB); } @@ -3117,7 +3285,7 @@ E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3152,7 +3320,7 @@ E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3178,12 +3346,12 @@ case Instruction::FPTrunc: case Instruction::BitCast: { ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast(V)->getOperand(0)); + for (auto *IOP : E->Scalars) + INVL.push_back(IOP->getOpInst()->getOperand(0)); - 
setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(VL0, INVL); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3203,15 +3371,16 @@ case Instruction::FCmp: case Instruction::ICmp: { ValueList LHSV, RHSV; - for (Value *V : E->Scalars) { - LHSV.push_back(cast(V)->getOperand(0)); - RHSV.push_back(cast(V)->getOperand(1)); + for (auto *IOP : E->Scalars) { + auto *I = IOP->getOpInst(); + LHSV.push_back(I->getOperand(0)); + RHSV.push_back(I->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(VL0, LHSV); + Value *R = vectorizeTree(VL0, RHSV); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3220,7 +3389,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->Scalars[0]->MainOpcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3236,17 +3405,18 @@ } case Instruction::Select: { ValueList TrueVec, FalseVec, CondVec; - for (Value *V : E->Scalars) { - CondVec.push_back(cast(V)->getOperand(0)); - TrueVec.push_back(cast(V)->getOperand(1)); - FalseVec.push_back(cast(V)->getOperand(2)); + for (auto *IOP : E->Scalars) { + auto *I = IOP->getOpInst(); + CondVec.push_back(I->getOperand(0)); + TrueVec.push_back(I->getOperand(1)); + FalseVec.push_back(I->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Cond = vectorizeTree(VL0, CondVec); + Value *True = vectorizeTree(VL0, TrueVec); + Value *False = vectorizeTree(VL0, FalseVec); if (E->VectorizedValue) { 
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3281,20 +3451,19 @@ case Instruction::Or: case Instruction::Xor: { ValueList LHSVL, RHSVL; - if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); - else - for (Value *V : E->Scalars) { - auto *I = cast(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); - } + if (isa(VL0) && VL0->isCommutative()) { + reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); + } else + for (auto *IOP : E->Scalars) { + auto *I = IOP->getOpInst(); + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); + } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(VL0, LHSVL); + Value *RHS = vectorizeTree(VL0, RHSVL); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3302,7 +3471,7 @@ } Value *V = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(VL0->getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3321,10 +3490,11 @@ // sink them all the way down past store instructions. 
bool IsReorder = !E->ReorderIndices.empty(); if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); + InstructionsState S = getSameOpcode(E->Scalars, + E->ReorderIndices.front()); VL0 = cast(S.OpValue); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3368,12 +3538,13 @@ unsigned AS = SI->getPointerAddressSpace(); ValueList ScalarStoreValues; - for (Value *V : E->Scalars) - ScalarStoreValues.push_back(cast(V)->getValueOperand()); + for (auto *IOP : E->Scalars) + ScalarStoreValues.push_back( + cast(IOP->getOpInst())->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(VL0, ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3398,22 +3569,26 @@ return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); ValueList Op0VL; - for (Value *V : E->Scalars) - Op0VL.push_back(cast(V)->getOperand(0)); + for (auto *IOP : E->Scalars) { + auto *I = IOP->getOpInst(); + Op0VL.push_back(cast(I)->getOperand(0)); + } - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(VL0, Op0VL); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; ++j) { ValueList OpVL; - for (Value *V : E->Scalars) - OpVL.push_back(cast(V)->getOperand(j)); + for (auto *IOP : E->Scalars) { + auto *I = IOP->getOpInst(); + OpVL.push_back(cast(I)->getOperand(j)); + } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(VL0, OpVL); OpVecs.push_back(OpVec); } @@ -3433,7 +3608,7 @@ } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars, S); + 
setInsertPointAfterBundle(E->Scalars); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -3451,12 +3626,12 @@ OpVecs.push_back(CEI->getArgOperand(j)); continue; } - for (Value *V : E->Scalars) { - CallInst *CEI = cast(V); + for (auto *IOP : E->Scalars) { + CallInst *CEI = cast(IOP->getOpInst()); OpVL.push_back(CEI->getArgOperand(j)); } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(VL0, OpVL); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3486,25 +3661,25 @@ } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->Scalars[0]->isAltShuffle() && + ((Instruction::isBinaryOp(E->Scalars[0]->MainOpcode) && + Instruction::isBinaryOp(E->Scalars[0]->AltOpcode)) || + (Instruction::isCast(E->Scalars[0]->MainOpcode) && + Instruction::isCast(E->Scalars[0]->AltOpcode))) && "Invalid Shuffle Vector Operand"); Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + if (Instruction::isBinaryOp(E->Scalars[0]->MainOpcode)) { + reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars); + LHS = vectorizeTree(VL0, LHSVL); + RHS = vectorizeTree(VL0, RHSVL); } else { ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + for (auto *IOP : E->Scalars) + INVL.push_back(IOP->getOpInst()->getOperand(0)); + setInsertPointAfterBundle(E->Scalars); + LHS = vectorizeTree(VL0, INVL); } if (E->VectorizedValue) { @@ -3513,28 +3688,31 @@ } Value *V0, *V1; - 
if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->Scalars[0]->MainOpcode)) { V0 = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->Scalars[0]->MainOpcode), LHS, + RHS); V1 = Builder.CreateBinOp( - static_cast(S.getAltOpcode()), LHS, RHS); + static_cast(E->Scalars[0]->AltOpcode), LHS, + RHS); } else { V0 = Builder.CreateCast( - static_cast(S.getOpcode()), LHS, VecTy); + static_cast(E->Scalars[0]->MainOpcode), LHS, + VecTy); V1 = Builder.CreateCast( - static_cast(S.getAltOpcode()), LHS, VecTy); + static_cast(E->Scalars[0]->AltOpcode), LHS, + VecTy); } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. - ValueList OpScalars, AltScalars; + IOPList OpScalars, AltScalars; unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { - auto *OpInst = cast(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + auto *I = E->Scalars[i]->getOpInst(); + if (I->getOpcode() == E->Scalars[0]->AltOpcode) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -3544,8 +3722,10 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, OpScalars); - propagateIRFlags(V1, AltScalars); + InstructionsState S = getSameOpcode(OpScalars); + propagateIRFlags(V0, OpScalars, S.OpValue); + S = getSameOpcode(AltScalars); + propagateIRFlags(V1, AltScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) @@ -3572,9 +3752,10 @@ Value * BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { - // All blocks must be scheduled before any instructions are inserted. 
for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + eliminatePseudos(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -3583,7 +3764,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].Scalars[0]->MainOp; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); @@ -3698,7 +3879,7 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { - Value *Scalar = Entry->Scalars[Lane]; + Value *Scalar = Entry->Scalars[Lane]->getOp(); Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { @@ -3707,7 +3888,8 @@ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to replace users in the ignorelist by undef. - assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && + assert((getTreeEntry(U) || + is_contained(UserIgnoreList, U)) && "Replacing out-of-tree value with undef"); } #endif @@ -3809,10 +3991,9 @@ // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. -bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, - BoUpSLP *SLP, - const InstructionsState &S) { - if (isa(S.OpValue)) +bool BoUpSLP::BlockScheduling::tryScheduleBundle( + ArrayRef IL, BoUpSLP *SLP) { + if (isa(IL[0]->MainOp)) return true; // Initialize the instruction bundle. 
@@ -3820,19 +4001,25 @@ ScheduleData *PrevInBundle = nullptr; ScheduleData *Bundle = nullptr; bool ReSchedule = false; - LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + bool IsPseudo = false; + LLVM_DEBUG(dbgs() << "SLP: bundle: " << *IL[0]->MainOp << "\n"); // Make sure that the scheduling region contains all // instructions of the bundle. - for (Value *V : VL) { - if (!extendSchedulingRegion(V, S)) + for (InstructionOrPseudo *IOP : IL) + if (!extendSchedulingRegion(IOP)) return false; - } - for (Value *V : VL) { - ScheduleData *BundleMember = getScheduleData(V); + for (InstructionOrPseudo *IOP : IL) { + ScheduleData *BundleMember = + ScheduleDataMap[IOP->getOpInst()][std::make_pair(nullptr, 0)]; + if (BundleMember->isPartOfBundle()) { + BundleMember = getScheduleData(IOP); + IsPseudo = true; + } assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); + assert(!BundleMember->isPartOfBundle() && "Already part of another bundle"); if (BundleMember->IsScheduled) { // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the @@ -3849,13 +4036,15 @@ Bundle = BundleMember; } BundleMember->UnscheduledDepsInBundle = 0; + BundleMember->Opcode = IOP->MainOpcode; + BundleMember->Parent = IOP->getParent(); Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. BundleMember->FirstInBundle = Bundle; PrevInBundle = BundleMember; } - if (ScheduleEnd != OldScheduleEnd) { + if (ScheduleEnd != OldScheduleEnd || IsPseudo) { // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to // recalculate all dependencies. 
@@ -3892,18 +4081,29 @@ } } if (!Bundle->isReady()) { - cancelScheduling(VL, S.OpValue); + cancelScheduling(IL[0]); + // We have to clear all dependencies, since all values + // were calculated for the vectorized bundle. + for (auto *I = ScheduleStart; I != ScheduleEnd; + I = I->getNextNode()) { + doForAllOpcodes(I, [](ScheduleData *SD) { + SD->clearDependencies(); + }); + } + resetSchedule(); return false; } return true; } -void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, - Value *OpValue) { - if (isa(OpValue)) +void BoUpSLP::BlockScheduling::cancelScheduling(InstructionOrPseudo *IOP) { + if (!IOP->MainOp) return; - - ScheduleData *Bundle = getScheduleData(OpValue); + if (isa(IOP->MainOp)) + return; + auto Key = IOP->getKey(); + ScheduleData *Bundle = getScheduleData(IOP)->FirstInBundle; + assert(Bundle && "Counld not find bundle"); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); @@ -3913,13 +4113,23 @@ // Un-bundle: make single instructions out of the bundle. 
ScheduleData *BundleMember = Bundle; while (BundleMember) { - assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); + assert(BundleMember->FirstInBundle == Bundle && "Corrupt bundle links"); + assert(BundleMember->Parent == Key.first && + BundleMember->Opcode == Key.second && "Corrupt bundle"); BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { - ReadyInsts.insert(BundleMember); + if (BundleMember->IsPseudo) { + ScheduleDataMap[BundleMember->Inst].erase(Key); + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + } else { + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + if (BundleMember->UnscheduledDepsInBundle == 0) { + ReadyInsts.insert(BundleMember); + } } BundleMember = Next; } @@ -3934,23 +4144,22 @@ return &(ScheduleDataChunks.back()[ChunkPos++]); } -bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, - const InstructionsState &S) { - if (getScheduleData(V, isOneOf(S, V))) +bool BoUpSLP::BlockScheduling::extendSchedulingRegion( + InstructionOrPseudo *IOP) { + if (getScheduleData(IOP)) return true; - Instruction *I = dyn_cast(V); - assert(I && "bundle member must be an instruction"); - assert(!isa(I) && "phi nodes don't need to be scheduled"); - auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { - ScheduleData *ISD = getScheduleData(I); - if (!ISD) + auto *I = IOP->getOpInst(); + assert(!isa(I) + && "phi nodes don't need to be scheduled"); + auto &&CheckSheduleForI = [this, &IOP](Instruction *I) -> bool { + ScheduleData *ISD = ScheduleDataMap[I][std::make_pair(nullptr, 0)]; + if (!ISD || ISD->SchedulingRegionID != SchedulingRegionID) return false; - assert(isInSchedulingRegion(ISD) && - "ScheduleData not in scheduling region"); - ScheduleData *SD = allocateScheduleDataChunks(); - SD->Inst = I; - 
SD->init(SchedulingRegionID, S.OpValue); - ExtraScheduleDataMap[I][S.OpValue] = SD; + if (ISD->isPartOfBundle()) { + ScheduleData *SD = allocateScheduleDataChunks(); + SD->init(SchedulingRegionID, ISD, IOP); + ScheduleDataMap[I][IOP->getKey()] = SD; + } return true; }; if (CheckSheduleForI(I)) @@ -3960,10 +4169,9 @@ initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a terminator?"); - LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); + LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I + << "\n"); return true; } // Search up and down at the same time, because we don't know if the new @@ -3981,23 +4189,20 @@ if (UpIter != UpperEnd) { if (&*UpIter == I) { - initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); + initScheduleData(I, ScheduleStart, nullptr, + FirstLoadStoreInRegion); ScheduleStart = I; - if (isOneOf(S, I) != I) - CheckSheduleForI(I); - LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I - << "\n"); + LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " + << *I << "\n"); return true; } UpIter++; } if (DownIter != LowerEnd) { if (&*DownIter == I) { - initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, - nullptr); + initScheduleData(ScheduleEnd, I->getNextNode(), + LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a terminator?"); LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); @@ -4011,21 +4216,20 @@ return true; } -void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, - Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore) { +void BoUpSLP::BlockScheduling::initScheduleData( + Instruction *FromI, Instruction *ToI, ScheduleData *PrevLoadStore, + 
ScheduleData *NextLoadStore) { ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - ScheduleData *SD = ScheduleDataMap[I]; + ScheduleData *SD = ScheduleDataMap[I][std::make_pair(nullptr, 0)]; if (!SD) { SD = allocateScheduleDataChunks(); - ScheduleDataMap[I] = SD; + ScheduleDataMap[I][std::make_pair(nullptr, 0)] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && "new ScheduleData already in scheduling region"); - SD->init(SchedulingRegionID, I); + SD->init(SchedulingRegionID); if (I->mayReadOrWriteMemory() && (!isa(I) || @@ -4060,8 +4264,13 @@ WorkList.pop_back(); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - assert(isInSchedulingRegion(BundleMember)); + assert(isInSchedulingRegion(BundleMember) && + BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); + if (!BundleMember->hasValidDependencies()) { LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember @@ -4070,35 +4279,22 @@ BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. 
- if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. + for (User *U : BundleMember->Inst->users()) { + if (auto *I = dyn_cast(U)) { + doForAllOpcodes(I, [&](ScheduleData *UseSD) { BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); - } + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + }); + } else { + // I'm not sure if this can ever happen. But we need to be safe. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. 
+ BundleMember->Dependencies++; + BundleMember->incrementUnscheduledDeps(1); } } @@ -4107,7 +4303,7 @@ if (DepDest) { Instruction *SrcInst = BundleMember->Inst; MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); - bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + bool SrcMayWrite = SrcInst->mayWriteToMemory(); unsigned numAliased = 0; unsigned DistToSrc = 1; @@ -4131,15 +4327,17 @@ // balance between reduced runtime and accurate dependencies. numAliased++; - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); - } + doForAllOpcodes(DepDest->Inst, [&](ScheduleData *Dep) { + Dep->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = Dep->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); + } + }); } DepDest = DepDest->NextLoadStore; @@ -4176,9 +4374,7 @@ assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - doForAllOpcodes(I, [&](ScheduleData *SD) { - assert(isInSchedulingRegion(SD) && - "ScheduleData not in scheduling region"); + doForAllOpcodes(I, [](ScheduleData *SD) { SD->IsScheduled = false; SD->resetUnscheduledDeps(); }); @@ -4186,6 +4382,84 @@ ReadyInsts.clear(); } +void BoUpSLP::eliminatePseudos(BlockScheduling *BS) { + if (!BS->ScheduleStart) + return; + bool Changed = false; + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + SmallSet, 2> Keys; + SmallDenseMap NewOps; + BS->doForAllOpcodes(I, [&](ScheduleData *SD) { + if (!SD->IsPseudo) + return; + std::pair Key = 
std::make_pair( + SD->Parent, SD->Opcode); + // Remove any pseudo instruction that + remains after bundles construction. + if (!SD->isPartOfBundle()) { + Keys.insert(Key); + return; + } + Changed = true; + + // Emitting a real instruction. + Builder.SetInsertPoint(I->getNextNode()); + Instruction *NewOp = nullptr; + NewOp = I->clone(); + BS->BB->getInstList().push_back(NewOp); + NewOp->moveAfter(I); + LLVM_DEBUG(dbgs() << "SLP: Replace pseudo instruction" + << *I << " with " << *NewOp + << " in " << *SD->FirstInBundle << "\n"); + + // Change instruction in ScheduleData. + SD->Inst = NewOp; + SD->IsPseudo = false; + + // Remove the pseudo instruction from ScheduleDataMap + // and add the real instruction. + NewOps[NewOp] = SD; + Keys.insert(Key); + int Idx = ScalarToTreeEntry[I][Key]; + ScalarToTreeEntry[NewOp][Key] = Idx; + ScalarToTreeEntry[I].erase(Key); + TreeEntry *E = &VectorizableTree[Idx]; + llvm::propagateIRFlags(NewOp, E->Scalars[0]->MainOp); + propagateMetadata(NewOp, E->Scalars); + + // Replace instruction in TreeEntry. + for (auto *IOP : E->Scalars) { + if (IOP->getOpInst() == I) + IOP->replaceWith(NewOp); + } + }); + for (auto Key : Keys) + BS->ScheduleDataMap[I].erase(Key); + for (auto Ops : NewOps) { + auto *SD = Ops.second; + BS->ScheduleDataMap[Ops.first][std::make_pair(SD->Parent, SD->Opcode)] = + SD; + } + } + + // We have to recalculate all dependencies in + the scheduling region if any pseudo instructions + were replaced. 
+ if (Changed) { + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, [&](ScheduleData *SD) { + SD->clearDependencies(); + }); + } + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -4211,8 +4485,11 @@ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + std::pair Key = std::make_pair(SD->Parent, SD->Opcode); assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (ScalarToTreeEntry.find(SD->Inst) != ScalarToTreeEntry.end() && + ScalarToTreeEntry[SD->Inst].find(Key) != + ScalarToTreeEntry[SD->Inst].end()) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -4233,20 +4510,33 @@ // Move the scheduled instruction(s) to their dedicated places, if not // there yet. 
ScheduleData *BundleMember = picked; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - Instruction *pickedInst = BundleMember->Inst; - if (LastScheduledInst->getNextNode() != pickedInst) { - BS->BB->getInstList().remove(pickedInst); + assert(Opcode == BundleMember->Opcode && + Parent == BundleMember->Parent && "Corrupt bundle member"); + Instruction *PickedInst = BundleMember->Inst; + if (LastScheduledInst != PickedInst) { + BS->BB->getInstList().remove(PickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), - pickedInst); + PickedInst); } - LastScheduledInst = pickedInst; + LastScheduledInst = PickedInst; BundleMember = BundleMember->NextInBundle; } - BS->schedule(picked, ReadyInsts); NumToSchedule--; } +#ifndef NDEBUG + if (NumToSchedule != 0) { + for (BasicBlock::iterator I = BS->BB->begin(), E = BS->BB->end(); I != E; + ++I) { + BS->doForAllOpcodes(&*I, [](ScheduleData *SD) { + LLVM_DEBUG(dbgs() << "SLP: Failed to schedule: " << *SD << "\n"); + }); + } + } +#endif assert(NumToSchedule == 0 && "could not schedule all instructions"); // Avoid duplicate scheduling of the block. @@ -4391,7 +4681,7 @@ // We only attempt to truncate integer expressions. auto &TreeRoot = VectorizableTree[0].Scalars; - auto *TreeRootIT = dyn_cast(TreeRoot[0]->getType()); + auto *TreeRootIT = dyn_cast(TreeRoot[0]->getOp()->getType()); if (!TreeRootIT) return; @@ -4401,7 +4691,9 @@ // This means that if a tree entry other than a root is used externally, it // must have multiple uses and InstCombine will not rewrite it. The code // below ensures that only the roots are used externally. - SmallPtrSet Expr(TreeRoot.begin(), TreeRoot.end()); + SmallPtrSet Expr; + for (auto *IOP : TreeRoot) + Expr.insert(IOP->getOp()); for (auto &EU : ExternalUses) if (!Expr.erase(EU.Scalar)) return; @@ -4412,13 +4704,16 @@ // context to determine which values can be demoted. 
If we see a truncation, // we mark it as seeding another demotion. for (auto &Entry : VectorizableTree) - Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end()); + for (auto *IOP: Entry.Scalars) + Expr.insert(IOP->getOp()); // Ensure the roots of the vectorizable tree don't form a cycle. They must // have a single external user that is not in the vectorizable tree. - for (auto *Root : TreeRoot) - if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) + for (auto *Root : TreeRoot) { + Value *V = Root->getOp(); + if (!V->hasOneUse() || Expr.count(*V->user_begin())) return; + } // Conservatively determine if we can actually truncate the roots of the // expression. Collect the values that can be demoted in ToDemote and @@ -4426,7 +4721,7 @@ SmallVector ToDemote; SmallVector Roots; for (auto *Root : TreeRoot) - if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) + if (!collectValuesToDemote(Root->getOp(), Expr, ToDemote, Roots)) return; // The maximum bit width required to represent all the values that can be @@ -4437,7 +4732,7 @@ // We first check if all the bits of the roots are demanded. If they're not, // we can truncate the roots to this narrower type. for (auto *Root : TreeRoot) { - auto Mask = DB->getDemandedBits(cast(Root)); + auto Mask = DB->getDemandedBits(Root->getOpInst()); MaxBitWidth = std::max( Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); } @@ -4456,17 +4751,18 @@ // We start by looking at each entry that can be demoted. We compute the // maximum bit width required to store the scalar by using ValueTracking to // compute the number of high-order bits we can truncate. 
- if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && - llvm::all_of(TreeRoot, [](Value *R) { - assert(R->hasOneUse() && "Root should have only one use!"); - return isa(R->user_back()); + if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getOp()->getType()) && + llvm::all_of(TreeRoot, [](InstructionOrPseudo *R) { + Value *V = R->getOp(); + assert(V->hasOneUse() && "Root should have only one use!"); + return isa(V->user_back()); })) { MaxBitWidth = 8u; // Determine if the sign bit of all the roots is known to be zero. If not, // IsKnownPositive is set to False. - IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) { - KnownBits Known = computeKnownBits(R, *DL); + IsKnownPositive = llvm::all_of(TreeRoot, [&](InstructionOrPseudo *R) { + KnownBits Known = computeKnownBits(R->getOp(), *DL); return Known.isNonNegative(); }); @@ -4864,7 +5160,14 @@ // Check that all of the parts are scalar instructions of the same type, // we permit an alternate opcode via InstructionsState. - InstructionsState S = getSameOpcode(VL); + SmallVector ArrayIL; + SmallVector IL; + + for (Value *V: VL) + ArrayIL.push_back(InstructionOrPseudo(VL[0], V)); + for (unsigned idx=0; idx < ArrayIL.size(); idx++) + IL.push_back(&ArrayIL[idx]); + InstructionsState S = getSameOpcode(IL); if (!S.getOpcode()) return false; Index: test/Transforms/SLPVectorizer/X86/duplicate.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/duplicate.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s + +; Reusing %sub.i1247 operation in two different bundles. 
+ +%class.j = type { %class.c* } +%class.c = type { [3 x double] } + +@h = dso_local local_unnamed_addr global i64 0, align 8 +@i = dso_local local_unnamed_addr global i64 0, align 8 + +; Function Attrs: uwtable +define dso_local void @foo(%class.j* nocapture readonly %this) local_unnamed_addr #0 align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: invoke.cont18: +; CHECK-NEXT: [[M:%.*]] = alloca [[CLASS_C:%.*]], align 8 +; CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[CLASS_C]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %class.c* [[M]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull [[TMP0]]) +; CHECK-NEXT: [[L:%.*]] = getelementptr inbounds [[CLASS_J:%.*]], %class.j* [[THIS:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load %class.c*, %class.c** [[L]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* @h, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[TMP1]], i64 [[TMP2]], i32 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX5_I49:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[TMP1]], i64 [[TMP2]], i32 0, i64 1 +; CHECK-NEXT: [[ARRAYIDX9_I51:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[TMP1]], i64 [[TMP2]], i32 0, i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX9_I51]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* @i, align 8 +; CHECK-NEXT: [[ARRAYIDX_I41:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[TMP1]], i64 [[TMP5]], i32 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX5_I44:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[TMP1]], i64 [[TMP5]], i32 0, i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX_I41]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP3]] to <2 x double>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] 
= fsub fast <2 x double> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> undef, double [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP12]], double [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> , double [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = fmul fast <2 x double> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = fsub fast <2 x double> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP17]], <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = fsub fast <2 x double> [[TMP21]], [[TMP18]] +; CHECK-NEXT: [[P_SROA_0_0__SROA_CAST65:%.*]] = bitcast %class.c* [[AGG_TMP]] to i64* +; CHECK-NEXT: store i64 4607182418800017408, i64* [[P_SROA_0_0__SROA_CAST65]], align 8 +; CHECK-NEXT: [[P_SROA_5_0__SROA_IDX68:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[AGG_TMP]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[P_SROA_6_0__SROA_IDX72:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[AGG_TMP]], i64 0, i32 0, i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast double* [[P_SROA_5_0__SROA_IDX68]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP22]], <2 x double>* [[TMP23]], align 8 +; CHECK-NEXT: invoke void @bar(%class.c* nonnull [[M]], %class.c* nonnull [[AGG_TMP]]) +; CHECK-NEXT: to label [[INVOKE_CONT24:%.*]] unwind label [[LPAD23:%.*]] +; CHECK: invoke.cont24: +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull [[TMP0]]) +; CHECK-NEXT: ret void +; CHECK: lpad23: +; CHECK-NEXT: [[TMP24:%.*]] = landingpad { i8*, 
i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull [[TMP0]]) +; CHECK-NEXT: resume { i8*, i32 } [[TMP24]] +; +invoke.cont18: + %m = alloca %class.c, align 8 + %agg.tmp = alloca %class.c, align 8 + %0 = bitcast %class.c* %m to i8* + call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %0) #3 + %l = getelementptr inbounds %class.j, %class.j* %this, i64 0, i32 0 + %1 = load %class.c*, %class.c** %l, align 8 + %2 = load i64, i64* @h, align 8 + %3 = getelementptr inbounds %class.c, %class.c* %1, i64 %2, i32 0, i64 0 + %4 = load double, double* %3, align 8 + %arrayidx5.i49 = getelementptr inbounds %class.c, %class.c* %1, i64 %2, i32 0, i64 1 + %arrayidx9.i51 = getelementptr inbounds %class.c, %class.c* %1, i64 %2, i32 0, i64 2 + %5 = load double, double* %arrayidx9.i51, align 8 + %6 = load i64, i64* @i, align 8 + %arrayidx.i41 = getelementptr inbounds %class.c, %class.c* %1, i64 %6, i32 0, i64 0 + %7 = load double, double* %arrayidx.i41, align 8 + %sub.i43 = fsub fast double %7, %4 + %arrayidx5.i44 = getelementptr inbounds %class.c, %class.c* %1, i64 %6, i32 0, i64 1 + %8 = load double, double* %arrayidx5.i44, align 8 + %9 = load double, double* %arrayidx5.i49, align 8 + %sub8.i = fsub fast double %8, %9 + %mul.i = fmul fast double %sub.i43, %5 + %mul8.i = fmul fast double %4, 2.000000e+00 + %sub.i = fsub fast double %mul.i, %mul8.i + %mul13.i = fmul fast double %sub8.i, %4 + %sub16.i = fsub fast double %mul13.i, %sub.i43 + %p.sroa.0.0..sroa_cast65 = bitcast %class.c* %agg.tmp to i64* + store i64 4607182418800017408, i64* %p.sroa.0.0..sroa_cast65, align 8 + %p.sroa.5.0..sroa_idx68 = getelementptr inbounds %class.c, %class.c* %agg.tmp, i64 0, i32 0, i64 1 + store double %sub.i, double* %p.sroa.5.0..sroa_idx68, align 8 + %p.sroa.6.0..sroa_idx72 = getelementptr inbounds %class.c, %class.c* %agg.tmp, i64 0, i32 0, i64 2 + store double %sub16.i, double* %p.sroa.6.0..sroa_idx72, align 8 + invoke void @bar(%class.c* nonnull %m, 
%class.c* nonnull %agg.tmp) + to label %invoke.cont24 unwind label %lpad23 + +invoke.cont24: ; preds = %invoke.cont18 + call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %0) #3 + ret void + +lpad23: ; preds = %invoke.cont18 + %10 = landingpad { i8*, i32 } + cleanup + call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %0) #3 + resume { i8*, i32 } %10 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare dso_local i32 @__gxx_personality_v0(...) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +declare dso_local void @bar(%class.c*, %class.c*) local_unnamed_addr #2