Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -173,40 +173,6 @@
          !Ty->isPPC_FP128Ty();
 }
 
-/// \returns true if all of the instructions in \p VL are in the same block or
-/// false otherwise.
-static bool allSameBlock(ArrayRef<Value *> VL) {
-  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
-  if (!I0)
-    return false;
-  BasicBlock *BB = I0->getParent();
-  for (int i = 1, e = VL.size(); i < e; i++) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I)
-      return false;
-
-    if (BB != I->getParent())
-      return false;
-  }
-  return true;
-}
-
-/// \returns True if all of the values in \p VL are constants.
-static bool allConstant(ArrayRef<Value *> VL) {
-  for (Value *i : VL)
-    if (!isa<Constant>(i))
-      return false;
-  return true;
-}
-
-/// \returns True if all of the values in \p VL are identical.
-static bool isSplat(ArrayRef<Value *> VL) {
-  for (unsigned i = 1, e = VL.size(); i < e; ++i)
-    if (VL[i] != VL[0])
-      return false;
-  return true;
-}
-
 /// Checks if the vector of instructions can be represented as a shuffle, like:
 /// %x0 = extractelement <4 x i8> %x, i32 0
 /// %x3 = extractelement <4 x i8> %x, i32 3
@@ -299,6 +265,47 @@
       : TargetTransformInfo::SK_PermuteSingleSrc;
 }
 
+static bool isRemainder(unsigned Opcode) {
+  return (Opcode == Instruction::URem ||
+          Opcode == Instruction::SRem || Opcode == Instruction::FRem);
+}
+
+/// Checks if the \p Opcode can be considered as an operand of a (possibly)
+/// binary operation \p I.
+/// \returns The code of the binary operation of instruction \p I if the
+/// instruction with \p Opcode can be considered as an operand of \p I with the
+/// default value.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+  if (Opcode != Instruction::PHI &&
+      Opcode != Instruction::Invoke &&
+      ((I->getType()->isIntegerTy() &&
+        (I->getOpcode() == Instruction::Add ||
+         I->getOpcode() == Instruction::And ||
+         I->getOpcode() == Instruction::AShr ||
+         I->getOpcode() == Instruction::BitCast ||
+         I->getOpcode() == Instruction::Call ||
+         I->getOpcode() == Instruction::ExtractElement ||
+         I->getOpcode() == Instruction::ExtractValue ||
+         I->getOpcode() == Instruction::ICmp ||
+         I->getOpcode() == Instruction::Load ||
+         I->getOpcode() == Instruction::LShr ||
+         I->getOpcode() == Instruction::Mul ||
+         I->getOpcode() == Instruction::Or ||
+         I->getOpcode() == Instruction::PtrToInt ||
+         I->getOpcode() == Instruction::SDiv ||
+         I->getOpcode() == Instruction::Select ||
+         I->getOpcode() == Instruction::SExt ||
+         I->getOpcode() == Instruction::Shl ||
+         I->getOpcode() == Instruction::Sub ||
+         I->getOpcode() == Instruction::Trunc ||
+         I->getOpcode() == Instruction::UDiv ||
+         I->getOpcode() == Instruction::Xor ||
+         I->getOpcode() == Instruction::ZExt)) ||
+       (isa<FPMathOperator>(I) && cast<FPMathOperator>(I)->isFast())))
+    return I->getOpcode();
+  return 0;
+}
+
 namespace {
 
 /// Main data required for vectorization of instructions.
@@ -319,17 +326,65 @@
     return AltOp ? AltOp->getOpcode() : 0;
   }
 
-  /// Some of the instructions in the list have alternate opcodes.
-  bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+  InstructionsState() = default;
+  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
+      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
+};
+
+class InstructionOrPseudo {
+private:
+  // Parent of the operation.
+  Value *Parent;
 
-  bool isOpcodeOrAlt(Instruction *I) const {
-    unsigned CheckedOpcode = I->getOpcode();
-    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
+public:
+  // The operation instance.
+  Value *Op;
+  // Instruction representation of the operation.
+  Instruction *OpInst;
+  // Main operation in a group of operations.
+  Instruction *MainOp = nullptr;
+  // Alternative operation in a group of operations.
+  Instruction *AltOp = nullptr;
+  // Pseudo operation after replacement.
+  Instruction *PseudoOp = nullptr;
+  // Main opcode of a group of operations.
+  unsigned MainOpcode = 0;
+  // Alternative opcode of a group of operations.
+  unsigned AltOpcode = 0;
+
+  InstructionOrPseudo(Value *Parent, Value *Op) : Parent(Parent), Op(Op) {
+    OpInst = dyn_cast<Instruction>(Op);
   }
 
-  InstructionsState() = delete;
-  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
-      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
+  void setInstructionsState(const InstructionsState &S) {
+    MainOp = S.MainOp;
+    AltOp = S.AltOp;
+    if (MainOp)
+      MainOpcode = MainOp->getOpcode();
+    if (AltOp)
+      AltOpcode = AltOp->getOpcode();
+  }
+
+  bool isNonAlt() {
+    if (!OpInst)
+      return false;
+    unsigned Opcode = OpInst->getOpcode();
+    return ((Opcode != MainOpcode) && (Opcode != AltOpcode));
+  }
+
+  bool isAltShuffle() const {
+    return (MainOpcode != 0 && AltOpcode != 0 && MainOpcode != AltOpcode);
+  }
+
+  bool isOpcodeOrAlt() const {
+    unsigned CheckedOpcode = OpInst->getOpcode();
+    return MainOpcode == CheckedOpcode || AltOpcode == CheckedOpcode;
+  }
+
+  std::pair<Value *, unsigned> getKey() const {
+    assert(Parent && "Incorrect parent!");
+    return std::make_pair(Parent, MainOpcode);
+  }
 };
 
 } // end anonymous namespace
@@ -339,68 +394,155 @@
 /// OpValue.
 static Value *isOneOf(const InstructionsState &S, Value *Op) {
   auto *I = dyn_cast<Instruction>(Op);
-  if (I && S.isOpcodeOrAlt(I))
-    return Op;
+  if (I) {
+    unsigned CheckedOpcode = I->getOpcode();
+    if (S.getOpcode() == CheckedOpcode || S.getAltOpcode() == CheckedOpcode)
+      return Op;
+  }
   return S.OpValue;
 }
 
+/// \returns True if all of the values in \p IL are constants.
+static bool allConstant(ArrayRef<InstructionOrPseudo *> IL) {
+  return llvm::all_of(
+      make_range(IL.begin(), IL.end()),
+      [](InstructionOrPseudo *I) -> bool { return isa<Constant>(I->Op); });
+}
+
+/// \returns True if all of the values in \p IL are identical.
+static bool isSplat(ArrayRef<InstructionOrPseudo *> IL) {
+  return llvm::all_of(
+      make_range(IL.begin(), IL.end()),
+      [&](InstructionOrPseudo *I) -> bool { return I->Op == IL[0]->Op; });
+}
+
+/// \returns true if all of the instructions in \p VL are in the same block or
+/// false otherwise.
+static bool allSameBlock(ArrayRef<InstructionOrPseudo *> IL) {
+  if (!IL[0]->OpInst)
+    return false;
+  BasicBlock *BB = IL[0]->OpInst->getParent();
+  for (int i = 1, e = IL.size(); i < e; i++) {
+    if (!IL[i]->OpInst)
+      return false;
+
+    if (BB != IL[i]->OpInst->getParent())
+      return false;
+  }
+  return true;
+}
+
 /// \returns analysis of the Instructions in \p VL described in
 /// InstructionsState, the Opcode that we suppose the whole list
 /// could be vectorized even if its structure is diverse.
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+static InstructionsState getSameOpcode(ArrayRef<InstructionOrPseudo *> IL,
                                        unsigned BaseIndex = 0) {
   // Make sure these are all Instructions.
- if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + if (llvm::any_of(IL, [](InstructionOrPseudo *IOP) { return !IOP->OpInst; })) + return InstructionsState(IL[BaseIndex]->Op, nullptr, nullptr); - bool IsCastOp = isa(VL[BaseIndex]); - bool IsBinOp = isa(VL[BaseIndex]); - unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); + unsigned Opcode = IL[BaseIndex]->OpInst->getOpcode(); + bool IsCastOp = isa(IL[BaseIndex]->OpInst); + bool IsBinOp = isa(IL[BaseIndex]->OpInst); + bool IsNonAlt = false; unsigned AltOpcode = Opcode; + // Number of Instructions in VL with Main opcode. + unsigned OpcodeNum = 0; + // Number of Instructions in VL with Alternative opcode. + unsigned AltOpcodeNum = 0; + // Number of Instructions in VL with Non-Alternative opcode. + unsigned NonAltNum = 0; + // Non-Alternative operation offset in VL. + unsigned NonAltIndex = 0; unsigned AltIndex = BaseIndex; - // Check for one alternate opcode from another BinaryOperator. - // TODO - generalize to support all operators (types, calls etc.). - for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); - if (IsBinOp && isa(VL[Cnt])) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) - continue; - if (Opcode == AltOpcode) { - AltOpcode = InstOpcode; - AltIndex = Cnt; - continue; - } - } else if (IsCastOp && isa(VL[Cnt])) { - Type *Ty0 = cast(VL[BaseIndex])->getOperand(0)->getType(); - Type *Ty1 = cast(VL[Cnt])->getOperand(0)->getType(); + // Check for an alternate opcode pattern. + for (int Cnt = 0, E = IL.size(); Cnt < E; Cnt++) { + auto *I = IL[Cnt]->OpInst; + unsigned InstOpcode = I->getOpcode(); + if (IsCastOp && isa(IL[Cnt]->OpInst)) { + Type *Ty0 = IL[BaseIndex]->OpInst->getOperand(0)->getType(); + Type *Ty1 = IL[Cnt]->OpInst->getOperand(0)->getType(); if (Ty0 == Ty1) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) + if (InstOpcode == Opcode) { + OpcodeNum++; + continue; + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; continue; + } if (Opcode == AltOpcode) { AltOpcode = InstOpcode; AltIndex = Cnt; + AltOpcodeNum++; continue; } } - } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) + return InstructionsState(IL[BaseIndex]->Op, nullptr, nullptr); + } + if (InstOpcode == Opcode) { + OpcodeNum++; continue; - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; + continue; + } + if (InstOpcode != Opcode && InstOpcode != AltOpcode) { + if (IsBinOp && AltOpcode == Opcode && isa(I)) { + AltOpcode = InstOpcode; + AltOpcodeNum++; + AltIndex = Cnt; + continue; + } + if (tryToRepresentAsInstArg(Opcode, I) || + (IsBinOp && + tryToRepresentAsInstArg(InstOpcode, + IL[BaseIndex]->OpInst))) { + if (!IsNonAlt) { + NonAltIndex = Cnt; + IsNonAlt = true; + } + NonAltNum++; + continue; + } + return InstructionsState(IL[BaseIndex]->Op, nullptr, nullptr); + } } - return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), - cast(VL[AltIndex])); + if (IsNonAlt && IL.size() > 2 && (OpcodeNum + AltOpcodeNum) <= NonAltNum) { + BaseIndex = NonAltIndex; + AltIndex = BaseIndex; + Opcode = IL[BaseIndex]->OpInst->getOpcode(); + AltOpcode = Opcode; + IsBinOp = isa(IL[BaseIndex]->Op); + for (int Cnt = 0, E = IL.size(); Cnt < E; Cnt++) { + auto *I = IL[Cnt]->OpInst; + unsigned InstOpcode = I->getOpcode(); + if (Opcode == AltOpcode && IsBinOp && isa(I)) { + AltOpcode = InstOpcode; + AltIndex = Cnt; + } + } + } + + 
if (IsNonAlt && (!IsBinOp || + isRemainder(Opcode) || + isRemainder(AltOpcode))) + return InstructionsState(IL[BaseIndex]->Op, nullptr, nullptr); + + return InstructionsState(IL[BaseIndex]->Op, IL[BaseIndex]->OpInst, + IL[AltIndex]->OpInst); } /// \returns true if all of the values in \p VL have the same type or false /// otherwise. static bool allSameType(ArrayRef VL) { Type *Ty = VL[0]->getType(); - for (int i = 1, e = VL.size(); i < e; i++) - if (VL[i]->getType() != Ty) - return false; - - return true; + return llvm::all_of( + make_range(VL.begin(), VL.end()), + [&](Value *V) -> bool { return V->getType() == Ty; }); } /// \returns True if Extract{Value,Element} instruction extracts element Idx. @@ -479,6 +621,7 @@ using InstrList = SmallVector; using ValueSet = SmallPtrSet; using StoreList = SmallVector; + using IOPList = SmallVector; using ExtraValueToDebugLocsMap = MapVector>; @@ -538,6 +681,7 @@ /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { + InstructionOrPseudoArray.clear(); VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -613,21 +757,22 @@ int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef Roots, unsigned Depth, int); + void buildTree_rec(Value *Parent, ArrayRef Roots, unsigned Depth, + int); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a /// vector) and sets \p CurrentOrder to the identity permutation; otherwise /// returns false, setting \p CurrentOrder to either an empty vector or a /// non-identity permutation that allows to reuse extract instructions. - bool canReuseExtract(ArrayRef VL, Value *OpValue, + bool canReuseExtract(ArrayRef IL, SmallVectorImpl &CurrentOrder) const; /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef VL); + Value *vectorizeTree(ArrayRef VL, Value *Parent); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -636,47 +781,58 @@ /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - int getGatherCost(ArrayRef VL); + int getGatherCost(ArrayRef IL); /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef VL, - const InstructionsState &S); + void setInsertPointAfterBundle(ArrayRef IL); /// \returns a vector from a collection of scalars in \p VL. - Value *Gather(ArrayRef VL, VectorType *Ty); + Value *Gather(ArrayRef IL, VectorType *Ty); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); + Instruction *propagateMetadata(Instruction *I, + ArrayRef IL); + + void propagateIRFlags(Value *I, + ArrayRef IL, + Value *OpValue); + /// \reorder commutative operands in alt shuffle if they result in /// vectorized code. - void reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef VL, + void reorderAltShuffleOperands(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right); /// \reorder commutative operands to get better probability of /// generating vectorized code. 
- void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef VL, + void reorderInputsAccordingToOpcode(unsigned Opcode, + ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right); struct TreeEntry { TreeEntry(std::vector &Container) : Container(Container) {} /// \returns true if the scalars in VL are equal to this entry. - bool isSame(ArrayRef VL) const { - if (VL.size() == Scalars.size()) - return std::equal(VL.begin(), VL.end(), Scalars.begin()); - return VL.size() == ReuseShuffleIndices.size() && + bool isSame(ArrayRef IL) const { + if (IL.size() == Scalars.size()) + return std::equal(IL.begin(), IL.end(), Scalars.begin(), + [this](InstructionOrPseudo *I1, InstructionOrPseudo *I2) + { return ((I1->Op == I2->Op) || + (I1->PseudoOp && (I1->PseudoOp == I2->Op)) || + (I2->PseudoOp && (I2->PseudoOp == I1->Op))); }); + return IL.size() == ReuseShuffleIndices.size() && std::equal( - VL.begin(), VL.end(), ReuseShuffleIndices.begin(), - [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; }); + IL.begin(), IL.end(), ReuseShuffleIndices.begin(), + [this](InstructionOrPseudo *I1, unsigned Idx) + { return ((I1->Op == Scalars[Idx]->Op) || (Scalars[Idx]->PseudoOp && I1->Op == Scalars[Idx]->PseudoOp)); }); } /// A vector of scalars. - ValueList Scalars; + IOPList Scalars; /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; @@ -704,24 +860,30 @@ }; /// Create a new VectorizableTree entry. - void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + void newTreeEntry(ArrayRef IL, bool Vectorized, + int &UserTreeIdx, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; - Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); + Last->Scalars.insert(Last->Scalars.begin(), IL.begin(), IL.end()); Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; if (Vectorized) { - for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + for (InstructionOrPseudo *IOP : IL) { + assert( + (ScalarToTreeEntry.find(IOP->OpInst) == ScalarToTreeEntry.end() || + ScalarToTreeEntry[IOP->OpInst].find(IOP->getKey()) == + ScalarToTreeEntry[IOP->OpInst].end()) && + "Scalar already in tree!"); + ScalarToTreeEntry[IOP->OpInst][IOP->getKey()] = idx; } } else { - MustGather.insert(VL.begin(), VL.end()); + for (InstructionOrPseudo *IOP : IL) + MustGather.insert(IOP->Op); } if (UserTreeIdx >= 0) @@ -733,15 +895,32 @@ /// Holds all of the tree entries. 
std::vector VectorizableTree; - TreeEntry *getTreeEntry(Value *V) { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + TreeEntry *getTreeEntry(Instruction *I) { + if (!I) + return nullptr; + auto It = ScalarToTreeEntry.find(I); + if (It != ScalarToTreeEntry.end()) { + auto &STT = It->second; + for (auto STTI : STT) + for (auto *IOP : VectorizableTree[STTI.second].Scalars) + if (IOP->OpInst == I && !IOP->isNonAlt()) + return &VectorizableTree[STTI.second]; + } return nullptr; } + std::vector> InstructionOrPseudoArray; + + InstructionOrPseudo *allocInstructionOrPseudo(Value* Parent, Value *V) { + InstructionOrPseudoArray.emplace_back( + llvm::make_unique(Parent, V)); + int idx = InstructionOrPseudoArray.size() - 1; + return (InstructionOrPseudoArray[idx]).get(); + } + /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; + SmallDenseMap, int>> + ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -833,15 +1012,32 @@ ScheduleData() = default; - void init(int BlockSchedulingRegionID, Value *OpVal) { + void init(int BlockSchedulingRegionID) { FirstInBundle = this; NextInBundle = nullptr; NextLoadStore = nullptr; IsScheduled = false; + IsPseudo = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + } + + // Initialize ScheduleData for pseudo instruction + // with ScheduleData for an original instruction OpOrigin. + void init(int BlockSchedulingRegionID, ScheduleData *OpOrigin, + Value *OpParent, unsigned OpCode) { + FirstInBundle = this; + NextInBundle = nullptr; + IsScheduled = false; + IsPseudo = true; + Opcode = OpCode; + Parent = OpParent; + Inst = OpOrigin->Inst; + NextLoadStore = OpOrigin->NextLoadStore; SchedulingRegionID = BlockSchedulingRegionID; UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); - OpValue = OpVal; } /// Returns true if the dependency information has been calculated. @@ -946,8 +1142,14 @@ /// dry-run). bool IsScheduled = false; - /// Opcode of the current instruction in the schedule data. - Value *OpValue = nullptr; + /// Opcode that represents instructions to be vectorized. + unsigned Opcode = 0; + + // Parent of this operation. + Value *Parent = nullptr; + + // True if this is a pseudo operation. 
+ bool IsPseudo = false; }; #ifndef NDEBUG @@ -985,22 +1187,16 @@ ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + ScheduleData *getScheduleData(InstructionOrPseudo *IOP) { + std::pair Key = IOP->getKey(); + ScheduleData *SD = ScheduleDataMap[IOP->OpInst][std::make_pair(nullptr, 0)]; + if (SD && SD->Parent == Key.first && SD->Opcode == Key.second && + SD->SchedulingRegionID == SchedulingRegionID) + return SD; + SD = ScheduleDataMap[IOP->OpInst][Key]; + if (SD && SD->Parent == Key.first && SD->Opcode == Key.second && + SD->SchedulingRegionID == SchedulingRegionID) return SD; - return nullptr; - } - - ScheduleData *getScheduleData(Value *V, Value *Key) { - if (V == Key) - return getScheduleData(V); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) - return SD; - } return nullptr; } @@ -1016,18 +1212,22 @@ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; - continue; - } + assert(BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); // Handle the def-use chain dependencies. for (Use &U : BundleMember->Inst->operands()) { auto *I = dyn_cast(U.get()); if (!I) continue; - doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { - if (OpDef && OpDef->hasValidDependencies() && + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + auto *OpDef = It.second; + if (OpDef && isInSchedulingRegion(OpDef) && + OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after // decrementing, so we can put the dependent instruction @@ -1039,7 +1239,7 @@ LLVM_DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); } - }); + } } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -1058,46 +1258,38 @@ } } - void doForAllOpcodes(Value *V, - function_ref Action) { - if (ScheduleData *SD = getScheduleData(V)) - Action(SD); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) - for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) - Action(P.second); - } - /// Put all instructions into the ReadyList which are ready for scheduling. template void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - doForAllOpcodes(I, [&](ScheduleData *SD) { - if (SD->isSchedulingEntity() && SD->isReady()) { + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + auto *SD = It.second; + if (SD && isInSchedulingRegion(SD) && + SD->isSchedulingEntity() && SD->isReady()) { ReadyList.insert(SD); LLVM_DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); } - }); + } } } /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. 
- bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, - const InstructionsState &S); + bool tryScheduleBundle(ArrayRef IL, BoUpSLP *SLP); /// Un-bundles a group of instructions. - void cancelScheduling(ArrayRef VL, Value *OpValue); + void cancelScheduling(InstructionOrPseudo *IOP); /// Allocates schedule data chunk. ScheduleData *allocateScheduleDataChunks(); - /// Extends the scheduling region so that V is inside the region. + /// Extends the scheduling region so that I is inside the region. /// \returns true if the region size is within the limit. - bool extendSchedulingRegion(Value *V, const InstructionsState &S); + bool extendSchedulingRegion(InstructionOrPseudo *IOP); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -1127,12 +1319,11 @@ /// Attaches ScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. - /// ScheduleData structures are recycled. - DenseMap ScheduleDataMap; - - /// Attaches ScheduleData to Instruction with the leading key. - DenseMap> - ExtraScheduleDataMap; + /// ScheduleData structures are recycled. All real instructions are placed + /// in [Instruction *][nullptr][0]. + DenseMap, ScheduleData *>> + ScheduleDataMap; struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } @@ -1171,6 +1362,9 @@ /// Attaches the BlockScheduling structures to basic blocks. MapVector> BlocksSchedules; + /// Remove or subsititude any pseudo instructions in the tree. + void eliminatePseudos(BlockScheduling *BS); + /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); @@ -1294,14 +1488,15 @@ std::string Str; raw_string_ostream OS(Str); if (isSplat(Entry->Scalars)) { - OS << " " << *Entry->Scalars[0]; + OS << " " << *Entry->Scalars[0]->Op; return Str; } - for (auto V : Entry->Scalars) { - OS << *V; - if (std::any_of( - R->ExternalUses.begin(), R->ExternalUses.end(), - [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) + for (auto *IOP : Entry->Scalars) { + OS << *IOP->Op; + if (std::any_of(R->ExternalUses.begin(), R->ExternalUses.end(), + [&](const BoUpSLP::ExternalUser &EU) { + return EU.Scalar == IOP->Op; + })) OS << " "; OS << "\n"; } @@ -1331,7 +1526,7 @@ UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots[0], Roots, 0, -1); // Collect the values that we need to extract from the tree. for (TreeEntry &EIdx : VectorizableTree) { @@ -1343,8 +1538,10 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { - Value *Scalar = Entry->Scalars[Lane]; + auto *Scalar = Entry->Scalars[Lane]; int FoundLane = Lane; + if (Scalar->isNonAlt()) + continue; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), @@ -1352,13 +1549,13 @@ } // Check if the scalar is externally used as an extra arg. 
- auto ExtI = ExternallyUsedValues.find(Scalar); + auto ExtI = ExternallyUsedValues.find(Scalar->Op); if (ExtI != ExternallyUsedValues.end()) { LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " - << Lane << " from " << *Scalar << ".\n"); - ExternalUses.emplace_back(Scalar, nullptr, FoundLane); + << Lane << " from " << *Scalar->Op << ".\n"); + ExternalUses.emplace_back(Scalar->Op, nullptr, FoundLane); } - for (User *U : Scalar->users()) { + for (User *U : Scalar->Op->users()) { LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); Instruction *UserInst = dyn_cast(U); @@ -1366,13 +1563,13 @@ continue; // Skip in-tree scalars that become vectors - if (TreeEntry *UseEntry = getTreeEntry(U)) { - Value *UseScalar = UseEntry->Scalars[0]; + if (TreeEntry *UseEntry = getTreeEntry(UserInst)) { + Value *UseScalar = UseEntry->Scalars[0]->Op; // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. if (UseScalar != U || - !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { + !InTreeUserNeedToExtract(Scalar->Op, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(!UseEntry->NeedToGather && "Bad state"); @@ -1385,42 +1582,48 @@ continue; LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " - << Lane << " from " << *Scalar << ".\n"); - ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); + << Lane << " from " << *Scalar->Op << ".\n"); + ExternalUses.push_back(ExternalUser(Scalar->Op, U, FoundLane)); } } } } -void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, +void BoUpSLP::buildTree_rec(Value *Parent, ArrayRef VL, unsigned Depth, int UserTreeIdx) { - assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); + SmallVector IL; + for (Value *V: VL) + IL.push_back(allocInstructionOrPseudo(Parent, V)); + InstructionsState S = getSameOpcode(IL); + for (InstructionOrPseudo *IOP : IL) + IOP->setInstructionsState(S); + + assert((allConstant(IL) || allSameType(VL)) && "Invalid types!"); - InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { + if (allConstant(IL) || isSplat(IL) || !allSameBlock(IL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } @@ -1428,23 +1631,19 @@ // the same block. // Don't vectorize ephemeral values. 
- for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { + for (unsigned i = 0, e = IL.size(); i != e; ++i) { + if (EphValues.count(IL[i]->Op)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } // Check if this is a duplicate of another entry. - if (TreeEntry *E = getTreeEntry(S.OpValue)) { + TreeEntry *E = getTreeEntry(cast(S.OpValue)); + if (E && E->isSame(IL)) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); - if (!E->isSame(VL)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); - return; - } // Record the reuse of the tree node. FIXME, currently this is only used to // properly draw the graph rather than for the actual vectorization. E->UserTreeIndices.push_back(UserTreeIdx); @@ -1454,14 +1653,15 @@ } // Check that none of the instructions in the bundle are already in the tree. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast(VL[i]); - if (!I) - continue; - if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (unsigned i = 0, e = IL.size(); i != e; ++i) { + auto *I = IL[i]->OpInst; + if (getTreeEntry(I) || + (ScalarToTreeEntry.find(I) != ScalarToTreeEntry.end() && + ScalarToTreeEntry[I].find(IL[i]->getKey()) != + ScalarToTreeEntry[I].end())) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *I << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } @@ -1469,47 +1669,50 @@ // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (unsigned i = 0, e = IL.size(); i != e; ++i) { + if (MustGather.count(IL[i]->Op)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } // Check that all of the users of the scalars that we want to vectorize are // schedulable. - auto *VL0 = cast(S.OpValue); + auto *VL0 = IL[0]->MainOp; BasicBlock *BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // Check that every instruction appears once in this bundle. 
SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; + SmallVector UniqueValues; DenseMap UniquePositions; - for (Value *V : VL) { - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + for (auto *I : IL) { + auto Res = UniquePositions.try_emplace(I->Op, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) - UniqueValues.emplace_back(V); + if (Res.second) { + InstructionOrPseudo *IOP = allocInstructionOrPseudo(Parent, I->Op); + IOP->setInstructionsState(S); + UniqueValues.emplace_back(IOP); + } } - if (UniqueValues.size() == VL.size()) { + if (UniqueValues.size() == IL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } - VL = UniqueValues; + IL = UniqueValues; } auto &BSRef = BlocksSchedules[BB]; @@ -1518,59 +1721,59 @@ BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + if (!BS.tryScheduleBundle(IL, this)) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL0) || - !BS.getScheduleData(VL0)->isPartOfBundle()) && + assert((!BS.getScheduleData(IL[0]) || + !BS.getScheduleData(IL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned ShuffleOrOp = S.isAltShuffle() ? + unsigned ShuffleOrOp = IL[0]->isAltShuffle() ? (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); // Check for terminator values (e.g. invoke). - for (unsigned j = 0; j < VL.size(); ++j) + for (unsigned j = 0; j < IL.size(); ++j) for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { Instruction *Term = dyn_cast( - cast(VL[j])->getIncomingValueForBlock( + cast(IL[j]->OpInst)->getIncomingValueForBlock( PH->getIncomingBlock(i))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) - Operands.push_back(cast(j)->getIncomingValueForBlock( + for (auto *j : IL) + Operands.push_back(cast(j->Op)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ExtractValue: case Instruction::ExtractElement: { OrdersType CurrentOrder; - bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + bool Reuse = canReuseExtract(IL, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -1587,13 +1790,15 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); - BS.cancelScheduling(VL, VL0); + newTreeEntry(IL, /*Vectorized=*/false, UserTreeIdx, + ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); return; } case Instruction::Load: { @@ -1607,21 +1812,21 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. - SmallVector PointerOps(VL.size()); + SmallVector PointerOps(IL.size()); auto POIter = PointerOps.begin(); - for (Value *V : VL) { - auto *L = cast(V); + for (auto *I : IL) { + auto *L = cast(I->OpInst); if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1647,18 +1852,18 @@ dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt().getZExtValue() == (IL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. 
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1667,8 +1872,8 @@ } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1684,26 +1889,26 @@ case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast(VL[i])->getOperand(0)->getType(); + for (unsigned i = 0; i < IL.size(); ++i) { + Type *Ty = IL[i]->OpInst->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *I : IL) + Operands.push_back(I->OpInst->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1712,28 +1917,28 @@ // Check that all of the compares have the same predicate. CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); + for (unsigned i = 1, e = IL.size(); i < e; ++i) { + CmpInst *Cmp = cast(IL[i]->OpInst); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *I : IL) + Operands.push_back(I->OpInst->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1756,36 +1961,44 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. 
if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderInputsAccordingToOpcode(S.getOpcode(), IL, Left, Right); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (auto *Inst: IL) { + auto *I = Inst->OpInst; + if (I->getOpcode() == S.getOpcode()) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Operands.push_back(Inst->Op); + } + if (allSameType(Operands)) + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast(VL[j])->getNumOperands() != 2) { + for (unsigned j = 0; j < IL.size(); ++j) { + if (IL[j]->OpInst->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1793,59 +2006,59 @@ // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast(VL[j])->getOperand(0)->getType(); + for (unsigned j = 0; j < IL.size(); ++j) { + Type *CurTy = IL[j]->OpInst->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast(VL[j])->getOperand(1); + for (unsigned j = 0; j < IL.size(); ++j) { + auto Op = IL[j]->OpInst->getOperand(1); if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *j : IL) + Operands.push_back(j->OpInst->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::Store: { // Check if the stores are consecutive or of we need to swizzle them. 
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + for (unsigned i = 0, e = IL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(IL[i]->Op, IL[i + 1]->Op, *DL, *SE)) { + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(0)); + for (auto *j : IL) + Operands.push_back(j->OpInst->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { @@ -1855,8 +2068,8 @@ // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1864,15 +2077,15 @@ Value *A1I = nullptr; if (hasVectorInstrinsicScalarOpd(ID, 1)) A1I = CI->getArgOperand(1); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast(VL[i]); + for (unsigned i = 1, e = IL.size(); i != e; ++i) { + CallInst *CI2 = dyn_cast(IL[i]->OpInst); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] - << "\n"); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << + *IL[i]->OpInst << "\n"); return; } // ctlz,cttz and powi are special intrinsics whose second argument @@ -1880,8 +2093,8 @@ if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1892,60 +2105,78 @@ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *IL[i]->OpInst << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); - Operands.push_back(CI2->getArgOperand(i)); + for (auto *VecOp : IL) { + auto *I = VecOp->OpInst; + if (VecOp->isOpcodeOrAlt()) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = ConstantExpr::getBinOpIdentity(S.getOpcode(), + I->getType(), true); + Operands.push_back(Operand); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + if (allSameType(Operands)) + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ShuffleVector: // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. - if (!S.isAltShuffle()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + if (!IL[0]->isAltShuffle()) { + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderAltShuffleOperands(IL, Left, Right); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *VecOp : IL) { + if (VecOp->isOpcodeOrAlt()) { + Operands.push_back(VecOp->OpInst->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = ConstantExpr::getBinOpIdentity( + S.getOpcode(), VecOp->OpInst->getType(), true); + Operands.push_back(Operand); + } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; default: - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1976,12 +2207,12 @@ return N; } -bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, +bool BoUpSLP::canReuseExtract(ArrayRef IL, SmallVectorImpl &CurrentOrder) const { - Instruction *E0 = cast(OpValue); + Instruction *E0 = IL[0]->MainOp; assert(E0->getOpcode() == Instruction::ExtractElement || E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + assert(E0->getOpcode() == getSameOpcode(IL).getOpcode() && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -1997,18 +2228,18 @@ return false; // Check if load can be rewritten as load of vector. 
LoadInst *LI = dyn_cast(Vec); - if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) + if (!LI || !LI->isSimple() || !LI->hasNUses(IL.size())) return false; } else { NElts = Vec->getType()->getVectorNumElements(); } - if (NElts != VL.size()) + if (NElts != IL.size()) return false; // Check that all of the indices extract from the correct offset. bool ShouldKeepOrder = true; - unsigned E = VL.size(); + unsigned E = IL.size(); // Assign to all items the initial value E + 1 so we can check if the extract // instruction index was used already. // Also, later we can check that all the indices are used and we have a @@ -2017,7 +2248,7 @@ CurrentOrder.assign(E, E + 1); unsigned I = 0; for (; I < E; ++I) { - auto *Inst = cast(VL[I]); + auto *Inst = IL[I]->OpInst; if (Inst->getOperand(0) != Vec) break; Optional Idx = getExtractIndex(Inst); @@ -2046,25 +2277,25 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { return I->hasOneUse() || std::all_of(I->user_begin(), I->user_end(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + return ScalarToTreeEntry.count(dyn_cast(U)) > 0; }); } int BoUpSLP::getEntryCost(TreeEntry *E) { - ArrayRef VL = E->Scalars; - - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) + ArrayRef IL = E->Scalars; + Type *ScalarTy = E->Scalars[0]->Op->getType(); + if (StoreInst *SI = dyn_cast(E->Scalars[0]->Op)) ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) + else if (CmpInst *CI = dyn_cast(E->Scalars[0]->Op)) ScalarTy = CI->getOperand(0)->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. - if (MinBWs.count(VL[0])) + if (MinBWs.count(E->Scalars[0]->Op)) VecTy = VectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + IntegerType::get(F->getContext(), MinBWs[E->Scalars[0]->Op].first), + IL.size()); unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -2074,26 +2305,28 @@ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } if (E->NeedToGather) { - if (allConstant(VL)) + if (allConstant(IL)) return 0; - if (isSplat(VL)) { + if (isSplat(IL)) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && - allSameType(VL) && allSameBlock(VL)) { + if (IL[0]->MainOpcode == Instruction::ExtractElement && allSameBlock(IL)) { + ValueList VL; + for (auto *IOP : IL) + VL.push_back(IOP->Op); Optional ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { + for (auto *IOP : IL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. 
- if (areAllUsersVectorized(cast(V)) && - !ScalarToTreeEntry.count(V)) { + if (areAllUsersVectorized(IOP->OpInst) && + !ScalarToTreeEntry.count(IOP->OpInst)) { auto *IO = cast( - cast(V)->getIndexOperand()); + cast(IOP->OpInst)->getIndexOperand()); Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, IO->getZExtValue()); } @@ -2101,13 +2334,12 @@ return ReuseShuffleCost + Cost; } } - return ReuseShuffleCost + getGatherCost(VL); + return ReuseShuffleCost + getGatherCost(IL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + auto *VL0 = E->Scalars[0]->MainOp; + assert(IL[0]->MainOpcode != 0 && allSameBlock(IL) && "Invalid VL"); + unsigned ShuffleOrOp = IL[0]->isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : IL[0]->MainOpcode; switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2119,7 +2351,7 @@ for (unsigned I : E->ReuseShuffleIndices) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( - cast(VL[I])->getIndexOperand()); + cast(IL[I]->OpInst)->getIndexOperand()); Idx = IO->getZExtValue(); ReuseShuffleCost -= TTI->getVectorInstrCost( Instruction::ExtractElement, VecTy, Idx); @@ -2130,10 +2362,10 @@ } } Idx = ReuseShuffleNumbers; - for (Value *V : VL) { + for (auto *IOP : IL) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( - cast(V)->getIndexOperand()); + cast(IOP->OpInst)->getIndexOperand()); Idx = IO->getZExtValue(); } else { --Idx; @@ -2149,15 +2381,15 @@ DeadCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *E = cast(VL[i]); + for (unsigned i = 0, e = IL.size(); i < e; ++i) { // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. - if (areAllUsersVectorized(E)) { + auto *I = IL[i]->OpInst; + if (areAllUsersVectorized(I)) { // Take credit for instruction that will become dead. - if (E->hasOneUse()) { - Instruction *Ext = E->user_back(); + if (I->hasOneUse()) { + Instruction *Ext = I->user_back(); if ((isa(Ext) || isa(Ext)) && all_of(Ext->users(), [](User *U) { return isa(U); })) { @@ -2167,7 +2399,8 @@ Ext->getOpcode(), Ext->getType(), VecTy, i); // Add back the cost of s|zext which is subtracted separately. DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), E->getType(), Ext); + Ext->getOpcode(), Ext->getType(), + I->getType(), Ext); continue; } } @@ -2177,7 +2410,7 @@ } return DeadCost; } - return ReuseShuffleCost + getGatherCost(VL); + return ReuseShuffleCost + getGatherCost(IL); case Instruction::ZExt: case Instruction::SExt: @@ -2193,20 +2426,21 @@ case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(IL[0]->MainOpcode, ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. 
- int ScalarCost = VL.size() * ScalarEltCost; + int ScalarCost = IL.size() * ScalarEltCost; - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + VectorType *SrcVecTy = VectorType::get(SrcTy, IL.size()); int VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(IL[0]->MainOpcode, VecTy, + SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2214,14 +2448,17 @@ case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, - Builder.getInt1Ty(), VL0); + int ScalarEltCost = TTI->getCmpSelInstrCost(IL[0]->MainOpcode, + ScalarTy, + Builder.getInt1Ty(), + VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), IL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(IL[0]->MainOpcode, VecTy, + MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: @@ -2247,7 +2484,7 @@ TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = @@ -2258,35 +2495,40 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. 
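+ // E.g. lanes { shl %a, 2 ; shl %b, 2 } keep OK_UniformConstantValue, lanes + // { shl %a, 2 ; shl %b, 3 } become OK_NonUniformConstantValue, and a + // non-constant second operand in any lane forces OK_AnyValue.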
- ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast(VL[i]); - ConstantInt *CInt = dyn_cast(I->getOperand(1)); - if (!CInt) { - Op2VK = TargetTransformInfo::OK_AnyValue; - Op2VP = TargetTransformInfo::OP_None; - break; - } - if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; + if (auto *CInt = dyn_cast(IL[0]->MainOp->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + const unsigned Opcode = IL[0]->MainOpcode; + for (auto *IOP : IL) { + auto *I = IOP->OpInst; + if (I == VL0 || Opcode != I->getOpcode()) + continue; + if (!isa(I->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + Op2VP = TargetTransformInfo::OP_None; + break; + } + ConstantInt *CInt_cur = cast(I->getOperand(1)); + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + if (Op2VP == TargetTransformInfo::OP_PowerOf2 && + !CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_None; + if (CInt != CInt_cur) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } - if (CInt0 != CInt) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + IL[0]->MainOpcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost(IL[0]->MainOpcode, VecTy, + Op1VK, Op2VK, Op1VP, Op2VP, + Operands); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2298,7 +2540,7 @@ int ScalarEltCost = TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = @@ -2309,13 +2551,15 @@ // Cost of wide load - cost of scalar loads. unsigned alignment = cast(VL0)->getAlignment(); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, + VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; int VecLdCost = - TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); + TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, + VL0); if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. 
VecLdCost += TTI->getShuffleCost( @@ -2329,7 +2573,7 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; int VecStCost = @@ -2352,7 +2596,7 @@ int ScalarEltCost = TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; @@ -2367,44 +2611,38 @@ return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(IL[0]->isAltShuffle() && + ((Instruction::isBinaryOp(IL[0]->MainOpcode) && + Instruction::isBinaryOp(IL[0]->AltOpcode)) || + (Instruction::isCast(IL[0]->MainOpcode) && + Instruction::isCast(IL[0]->AltOpcode))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { - for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); + for (unsigned Idx : E->ReuseShuffleIndices) ReuseShuffleCost -= TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); - } - for (Value *V : VL) { - Instruction *I = cast(V); + IL[Idx]->OpInst, TargetTransformInfo::TCK_RecipThroughput); + for (auto *IOP : IL) ReuseShuffleCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); - } + IOP->OpInst, TargetTransformInfo::TCK_RecipThroughput); } - for (Value *i : VL) { - Instruction *I = cast(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + for (auto *IOP : IL) ScalarCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); - } + IOP->OpInst, TargetTransformInfo::TCK_RecipThroughput); // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
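+ // E.g. for an <add, sub> alternation this is roughly the cost of one vector + // add plus one vector sub plus one SK_Select shuffle.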
int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(IL[0]->MainOpcode)) { + VecCost = TTI->getArithmeticInstrCost(IL[0]->MainOpcode, VecTy); + VecCost += TTI->getArithmeticInstrCost(IL[0]->AltOpcode, VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); - VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); - VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + Type *Src0SclTy = IL[0]->MainOp->getOperand(0)->getType(); + Type *Src1SclTy = IL[0]->AltOp->getOperand(0)->getType(); + VectorType *Src0Ty = VectorType::get(Src0SclTy, IL.size()); + VectorType *Src1Ty = VectorType::get(Src1SclTy, IL.size()); + VecCost = TTI->getCastInstrCost(IL[0]->MainOpcode, VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(IL[0]->AltOpcode, VecTy, + Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -2428,7 +2666,7 @@ // Handle splat and all-constants stores. if (!VectorizableTree[0].NeedToGather && (allConstant(VectorizableTree[1].Scalars) || - isSplat(VectorizableTree[1].Scalars))) + isSplat(VectorizableTree[1].Scalars))) return true; // Gathering cost would be too much for tiny trees. @@ -2438,6 +2676,23 @@ return true; } +Instruction *BoUpSLP::propagateMetadata(Instruction *I, + ArrayRef IL) { + ValueList VL; + for (auto *IOP: IL) + VL.push_back(IOP->Op); + return llvm::propagateMetadata(I, VL); +} + +void BoUpSLP::propagateIRFlags(Value *I, + ArrayRef IL, + Value *OpValue) { + ValueList VL; + for (auto *IOP: IL) + VL.push_back(IOP->Op); + llvm::propagateIRFlags(I, VL, OpValue); +} + bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. @@ -2470,7 +2725,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = N.Scalars[0]->MainOp; if (!Inst) continue; @@ -2482,8 +2737,9 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) - LiveValues.insert(cast(&*J)); + auto *I = dyn_cast(&*J); + if (I && getTreeEntry(I)) + LiveValues.insert(I); } LLVM_DEBUG({ @@ -2554,7 +2810,7 @@ int C = getEntryCost(&TE); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for bundle that starts with " << *TE.Scalars[0] + << " for bundle that starts with " << *TE.Scalars[0]->Op << ".\n"); Cost += C; } @@ -2576,7 +2832,7 @@ // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].Scalars[0]->Op; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = @@ -2619,21 +2875,21 @@ return Cost; } -int BoUpSLP::getGatherCost(ArrayRef VL) { - // Find the type of the operands in VL. 
- Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) +int BoUpSLP::getGatherCost(ArrayRef IL) { + // Find the type of the operands in IL. + Type *ScalarTy = IL[0]->Op->getType(); + if (StoreInst *SI = dyn_cast(IL[0]->Op)) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. DenseSet ShuffledElements; DenseSet UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. - for (unsigned I = VL.size(); I > 0; --I) { + for (unsigned I = IL.size(); I > 0; --I) { unsigned Idx = I - 1; - if (!UniqueElements.insert(VL[Idx]).second) + if (!UniqueElements.insert(IL[Idx]->Op).second) ShuffledElements.insert(Idx); } return getGatherCost(VecTy, ShuffledElements); @@ -2648,25 +2904,29 @@ // load a[3] + load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize this // code. -void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef VL, +void BoUpSLP::reorderAltShuffleOperands(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right) { // Push left and right operands of binary operation into Left and Right - for (Value *V : VL) { - auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); + for (auto *Inst : IL) { + if (Inst->isOpcodeOrAlt()) { + Left.push_back(Inst->OpInst->getOperand(0)); + Right.push_back(Inst->OpInst->getOperand(1)); + } else { + Left.push_back(Inst->OpInst); + Right.push_back(ConstantExpr::getBinOpIdentity(Inst->MainOpcode, + Inst->OpInst->getType(), + true)); + } } // Reorder if we have a commutative operation and consecutive access // are on either side of the alternate instructions. - for (unsigned j = 0; j < VL.size() - 1; ++j) { + for (unsigned j = 0; j < IL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { - Instruction *VL1 = cast(VL[j]); - Instruction *VL2 = cast(VL[j + 1]); + Instruction *VL1 = IL[j]->OpInst; + Instruction *VL2 = IL[j + 1]->OpInst; if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; @@ -2680,8 +2940,8 @@ } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { - Instruction *VL1 = cast(VL[j]); - Instruction *VL2 = cast(VL[j + 1]); + Instruction *VL1 = IL[j]->OpInst; + Instruction *VL2 = IL[j + 1]->OpInst; if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; @@ -2706,8 +2966,13 @@ int i, unsigned Opcode, Instruction &I, ArrayRef Left, ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); + if (I.getOpcode() == Opcode) { + VLeft = I.getOperand(0); + VRight = I.getOperand(1); + } else { + VLeft = &I; + VRight = ConstantExpr::getBinOpIdentity(Opcode, I.getType(), true); + } // If we have "SplatRight", try to see if commuting is needed to preserve it. 
if (SplatRight) { if (VRight == Right[i - 1]) @@ -2764,15 +3029,22 @@ } void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, - ArrayRef VL, + ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right) { - if (!VL.empty()) { + if (!IL.empty()) { // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. - auto *I = cast(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); + Value *VLeft; + Value *VRight; + if (IL[0]->OpInst->getOpcode() == Opcode) { + VLeft = IL[0]->OpInst->getOperand(0); + VRight = IL[0]->OpInst->getOperand(1); + } else { + VLeft = IL[0]->OpInst; + VRight = ConstantExpr::getBinOpIdentity(Opcode, IL[0]->OpInst->getType(), + true); + } if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); @@ -2787,8 +3059,8 @@ bool SplatLeft = true; bool SplatRight = true; - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - Instruction *I = cast(VL[i]); + for (unsigned i = 1, e = IL.size(); i != e; ++i) { + Instruction *I = IL[i]->OpInst; assert(((I->getOpcode() == Opcode && I->isCommutative()) || (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && "Can only process commutative instruction"); @@ -2835,7 +3107,7 @@ // add a[1],c[2] load b[1] // b[2] load b[2] // add a[3],c[3] load b[3] - for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { + for (unsigned j = 0, e = IL.size() - 1; j < e; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { @@ -2856,31 +3128,27 @@ } } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, - const InstructionsState &S) { +void BoUpSLP::setInsertPointAfterBundle(ArrayRef IL) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. - auto *Front = cast(S.OpValue); + auto *Front = IL[0]->MainOp; auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - auto *I = cast(V); - return !S.isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of( + make_range(IL.begin(), IL.end()), [=](InstructionOrPseudo *I) -> bool { + return !I->isOpcodeOrAlt() || I->OpInst->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; - // Find the last instruction. The common case should be that BB has been - // scheduled, and the last instruction is VL.back(). So we start with - // VL.back() and iterate over schedule data until we reach the end of the - // bundle. The end of the bundle is marked by null ScheduleData. + // Find the last instruction. If the bundle has not been scheduled yet, + // the first member of the bundle is the last one in BB in program order, + // because bundles are discovered in a backward walk. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlockScheduling *BS = BlocksSchedules[BB].get(); + auto *Bundle = BS->getScheduleData(IL[0]); if (Bundle && Bundle->isPartOfBundle()) - for (; Bundle; Bundle = Bundle->NextInBundle) - if (Bundle->OpValue == Bundle->Inst) - LastInst = Bundle->Inst; + LastInst = Bundle->FirstInBundle->Inst; } // LastInst can still be null at this point if there's either not an entry @@ -2902,9 +3170,11 @@ // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). 
if (!LastInst) { - SmallPtrSet Bundle(VL.begin(), VL.end()); + SmallPtrSet Bundle; + for (auto *IOP : IL) + Bundle.insert(IOP->Op); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) + if (Bundle.erase(&I)) LastInst = &I; if (Bundle.empty()) break; @@ -2917,22 +3187,22 @@ Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { +Value *BoUpSLP::Gather(ArrayRef IL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { - Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); + Vec = Builder.CreateInsertElement(Vec, IL[i]->Op, Builder.getInt32(i)); if (Instruction *Insrt = dyn_cast(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. - if (TreeEntry *E = getTreeEntry(VL[i])) { + if (TreeEntry *E = getTreeEntry(IL[i]->OpInst)) { // Find which lane we need to extract. int FoundLane = -1; for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) { // Is this the lane of the scalar that we are looking for ? - if (E->Scalars[Lane] == VL[i]) { + if (E->Scalars[Lane]->Op == IL[i]->Op) { FoundLane = Lane; break; } @@ -2943,7 +3213,7 @@ std::distance(E->ReuseShuffleIndices.begin(), llvm::find(E->ReuseShuffleIndices, FoundLane)); } - ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); + ExternalUses.push_back(ExternalUser(IL[i]->Op, Insrt, FoundLane)); } } } @@ -2951,29 +3221,33 @@ return Vec; } -Value *BoUpSLP::vectorizeTree(ArrayRef VL) { - InstructionsState S = getSameOpcode(VL); - if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - if (E->isSame(VL)) { - Value *V = vectorizeTree(E); - if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { - // We need to get the vectorized value but without shuffle. - if (auto *SV = dyn_cast(V)) { - V = SV->getOperand(0); - } else { - // Reshuffle to get only unique values. - SmallVector UniqueIdxs; - SmallSet UsedIdxs; - for(unsigned Idx : E->ReuseShuffleIndices) - if (UsedIdxs.insert(Idx).second) +Value *BoUpSLP::vectorizeTree(ArrayRef VL, Value *Parent) { + IOPList IL; + for (Value *V: VL) + IL.push_back(allocInstructionOrPseudo(Parent, V)); + InstructionsState S = getSameOpcode(IL); + for (InstructionOrPseudo *IOP: IL) + IOP->setInstructionsState(S); + if (IL[0]->MainOp) { + TreeEntry *E = getTreeEntry(IL[0]->MainOp); + if (E && E->isSame(IL)) { + Value *V = vectorizeTree(E); + if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { + // We need to get the vectorized value but without shuffle. + if (auto *SV = dyn_cast(V)) { + V = SV->getOperand(0); + } else { + // Reshuffle to get only unique values. + SmallVector UniqueIdxs; + SmallSet UsedIdxs; + for (unsigned Idx : E->ReuseShuffleIndices) + if (UsedIdxs.insert(Idx).second) UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), - UniqueIdxs); - } + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), + UniqueIdxs); } - return V; } + return V; } } @@ -2983,26 +3257,26 @@ // Check that every instruction appears once in this bundle. 
SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; - if (VL.size() > 2) { + IOPList UniqueValues; + if (IL.size() > 2) { DenseMap UniquePositions; - for (Value *V : VL) { - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + for (auto *I : IL) { + auto Res = UniquePositions.try_emplace(I->Op, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second || isa(V)) - UniqueValues.emplace_back(V); + if (Res.second || isa(I->Op)) + UniqueValues.emplace_back(I); } // Do not shuffle single element or if number of unique values is not power // of 2. - if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 || + if (UniqueValues.size() == IL.size() || UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) ReuseShuffleIndicies.clear(); else - VL = UniqueValues; + IL = UniqueValues; } - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); - Value *V = Gather(VL, VecTy); + Value *V = Gather(IL, VecTy); if (!ReuseShuffleIndicies.empty()) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), ReuseShuffleIndicies, "shuffle"); @@ -3027,12 +3301,12 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " + << *E->Scalars[0]->MainOp << ".\n"); return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(S.OpValue); + auto *VL0 = E->Scalars[0]->MainOp; Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3041,7 +3315,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3055,8 +3329,8 @@ return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = E->Scalars[0]->isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : E->Scalars[0]->MainOpcode; switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3084,12 +3358,13 @@ } // Prepare the operand vector. 
- for (Value *V : E->Scalars) - Operands.push_back(cast(V)->getIncomingValueForBlock(IBB)); + for (auto *IOP : E->Scalars) + Operands.push_back( + cast(IOP->Op)->getIncomingValueForBlock(IBB)); Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(Operands, VL0); NewPhi->addIncoming(Vec, IBB); } @@ -3118,7 +3393,7 @@ E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3153,7 +3428,7 @@ E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3179,12 +3454,12 @@ case Instruction::FPTrunc: case Instruction::BitCast: { ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast(V)->getOperand(0)); + for (auto *IOP : E->Scalars) + INVL.push_back(IOP->OpInst->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(INVL, VL0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3204,15 +3479,15 @@ case Instruction::FCmp: case Instruction::ICmp: { ValueList LHSV, RHSV; - for (Value *V : E->Scalars) { - LHSV.push_back(cast(V)->getOperand(0)); - RHSV.push_back(cast(V)->getOperand(1)); + for (auto *IOP : E->Scalars) { + LHSV.push_back(IOP->OpInst->getOperand(0)); + RHSV.push_back(IOP->OpInst->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(LHSV, VL0); + Value *R = vectorizeTree(RHSV, VL0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3221,7 +3496,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->Scalars[0]->MainOpcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3237,17 +3512,17 @@ } case Instruction::Select: { ValueList TrueVec, FalseVec, CondVec; - for (Value *V : E->Scalars) { - CondVec.push_back(cast(V)->getOperand(0)); - TrueVec.push_back(cast(V)->getOperand(1)); - FalseVec.push_back(cast(V)->getOperand(2)); + for (auto *IOP : E->Scalars) { + CondVec.push_back(IOP->OpInst->getOperand(0)); + TrueVec.push_back(IOP->OpInst->getOperand(1)); + FalseVec.push_back(IOP->OpInst->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Cond = vectorizeTree(CondVec, VL0); + Value *True = vectorizeTree(TrueVec, VL0); + Value *False = vectorizeTree(FalseVec, VL0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3282,20 +3557,19 @@ case Instruction::Or: case Instruction::Xor: { ValueList LHSVL, RHSVL; - if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); - else - for (Value *V : E->Scalars) { - auto *I = cast(V); - 
LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); - } + if (isa(VL0) && VL0->isCommutative()) { + reorderInputsAccordingToOpcode(E->Scalars[0]->MainOpcode, + E->Scalars, LHSVL, RHSVL); + } else + for (auto *IOP : E->Scalars) { + LHSVL.push_back(IOP->OpInst->getOperand(0)); + RHSVL.push_back(IOP->OpInst->getOperand(1)); + } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(LHSVL, VL0); + Value *RHS = vectorizeTree(RHSVL, VL0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3303,7 +3577,7 @@ } Value *V = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(VL0->getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3322,10 +3596,11 @@ // sink them all the way down past store instructions. bool IsReorder = !E->ReorderIndices.empty(); if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); + InstructionsState S = getSameOpcode(E->Scalars, + E->ReorderIndices.front()); VL0 = cast(S.OpValue); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3338,7 +3613,7 @@ // ExternalUses list to make sure that an extract will be generated in the // future. Value *PO = LI->getPointerOperand(); - if (getTreeEntry(PO)) + if (getTreeEntry(dyn_cast(PO))) ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); @@ -3369,12 +3644,13 @@ unsigned AS = SI->getPointerAddressSpace(); ValueList ScalarStoreValues; - for (Value *V : E->Scalars) - ScalarStoreValues.push_back(cast(V)->getValueOperand()); + for (auto *IOP : E->Scalars) + ScalarStoreValues.push_back( + cast(IOP->OpInst)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(ScalarStoreValues, VL0); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3382,7 +3658,7 @@ // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the // future. 
- if (getTreeEntry(ScalarPtr)) + if (getTreeEntry(dyn_cast(ScalarPtr))) ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); if (!Alignment) @@ -3399,22 +3675,22 @@ return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); ValueList Op0VL; - for (Value *V : E->Scalars) - Op0VL.push_back(cast(V)->getOperand(0)); + for (auto *IOP : E->Scalars) + Op0VL.push_back(cast(IOP->OpInst)->getOperand(0)); - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(Op0VL, VL0); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; ++j) { ValueList OpVL; - for (Value *V : E->Scalars) - OpVL.push_back(cast(V)->getOperand(j)); + for (auto *IOP : E->Scalars) + OpVL.push_back(cast(IOP->OpInst)->getOperand(j)); - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(OpVL, VL0); OpVecs.push_back(OpVec); } @@ -3434,7 +3710,7 @@ } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -3452,12 +3728,12 @@ OpVecs.push_back(CEI->getArgOperand(j)); continue; } - for (Value *V : E->Scalars) { - CallInst *CEI = cast(V); + for (auto *IOP : E->Scalars) { + CallInst *CEI = cast(IOP->OpInst); OpVL.push_back(CEI->getArgOperand(j)); } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(OpVL, VL0); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3473,7 +3749,7 @@ // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. 
- if (ScalarArg && getTreeEntry(ScalarArg)) + if (ScalarArg && getTreeEntry(dyn_cast(ScalarArg))) ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); propagateIRFlags(V, E->Scalars, VL0); @@ -3487,25 +3763,25 @@ } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->Scalars[0]->isAltShuffle() && + ((Instruction::isBinaryOp(E->Scalars[0]->MainOpcode) && + Instruction::isBinaryOp(E->Scalars[0]->AltOpcode)) || + (Instruction::isCast(E->Scalars[0]->MainOpcode) && + Instruction::isCast(E->Scalars[0]->AltOpcode))) && "Invalid Shuffle Vector Operand"); Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + if (Instruction::isBinaryOp(E->Scalars[0]->MainOpcode)) { + reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars); + LHS = vectorizeTree(LHSVL, VL0); + RHS = vectorizeTree(RHSVL, VL0); } else { ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + for (auto *IOP : E->Scalars) + INVL.push_back(IOP->OpInst->getOperand(0)); + setInsertPointAfterBundle(E->Scalars); + LHS = vectorizeTree(INVL, VL0); } if (E->VectorizedValue) { @@ -3514,28 +3790,30 @@ } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->Scalars[0]->MainOpcode)) { V0 = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->Scalars[0]->MainOpcode), LHS, + RHS); V1 = Builder.CreateBinOp( - static_cast(S.getAltOpcode()), LHS, RHS); + static_cast(E->Scalars[0]->AltOpcode), LHS, + RHS); } else { V0 = Builder.CreateCast( - static_cast(S.getOpcode()), LHS, VecTy); + static_cast(E->Scalars[0]->MainOpcode), LHS, + VecTy); V1 = Builder.CreateCast( - static_cast(S.getAltOpcode()), LHS, VecTy); + static_cast(E->Scalars[0]->AltOpcode), LHS, + VecTy); } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. - ValueList OpScalars, AltScalars; + IOPList OpScalars, AltScalars; unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { - auto *OpInst = cast(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + if (E->Scalars[i]->OpInst->getOpcode() == E->Scalars[0]->AltOpcode) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -3545,8 +3823,10 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, OpScalars); - propagateIRFlags(V1, AltScalars); + InstructionsState S = getSameOpcode(OpScalars); + propagateIRFlags(V0, OpScalars, S.OpValue); + S = getSameOpcode(AltScalars); + propagateIRFlags(V1, AltScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) @@ -3573,9 +3853,10 @@ Value * BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { - // All blocks must be scheduled before any instructions are inserted. 
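+ // Pseudo operations are materialized as real identity instructions + // (eliminatePseudos) before each block is scheduled, so scheduling only + // ever sees real instructions.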
for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + eliminatePseudos(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -3584,7 +3865,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].Scalars[0]->MainOp; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); @@ -3609,7 +3890,7 @@ }; // Extract all of the elements with the external uses. - for (const auto &ExternalUse : ExternalUses) { + for (auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; llvm::User *User = ExternalUse.User; @@ -3617,7 +3898,7 @@ // has multiple uses of the same value. if (User && !is_contained(Scalar->users(), User)) continue; - TreeEntry *E = getTreeEntry(Scalar); + TreeEntry *E = getTreeEntry(dyn_cast(Scalar)); assert(E && "Invalid scalar"); assert(!E->NeedToGather && "Extracting from a gather list"); @@ -3699,7 +3980,10 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { - Value *Scalar = Entry->Scalars[Lane]; + Value *Scalar = Entry->Scalars[Lane]->Op; + + if (!Entry->Scalars[Lane]->isOpcodeOrAlt()) + continue; Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { @@ -3708,7 +3992,9 @@ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to replace users in the ignorelist by undef. - assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && + assert((getTreeEntry(dyn_cast(U)) || + Entry->Scalars[Lane]->PseudoOp || + is_contained(UserIgnoreList, U)) && "Replacing out-of-tree value with undef"); } #endif @@ -3716,7 +4002,7 @@ Scalar->replaceAllUsesWith(Undef); } LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); - eraseInstruction(cast(Scalar)); + eraseInstruction(Entry->Scalars[Lane]->OpInst); } } @@ -3810,10 +4096,9 @@ // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. -bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, - BoUpSLP *SLP, - const InstructionsState &S) { - if (isa(S.OpValue)) +bool BoUpSLP::BlockScheduling::tryScheduleBundle( + ArrayRef IL, BoUpSLP *SLP) { + if (isa(IL[0]->MainOp)) return true; // Initialize the instruction bundle. @@ -3821,19 +4106,27 @@ ScheduleData *PrevInBundle = nullptr; ScheduleData *Bundle = nullptr; bool ReSchedule = false; - LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + bool IsPseudo = false; + LLVM_DEBUG(dbgs() << "SLP: bundle: " << *IL[0]->MainOp << "\n"); // Make sure that the scheduling region contains all // instructions of the bundle. 
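+ // A bundle member whose opcode is neither the main nor the alternate opcode + // is tracked as a pseudo operation: besides the plain per-instruction entry, + // it gets an extra ScheduleData entry keyed by the bundle's (Parent, Opcode) + // pair.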
- for (Value *V : VL) { - if (!extendSchedulingRegion(V, S)) + for (InstructionOrPseudo *IOP : IL) + if (!extendSchedulingRegion(IOP)) return false; - } - for (Value *V : VL) { - ScheduleData *BundleMember = getScheduleData(V); + for (InstructionOrPseudo *IOP : IL) { + ScheduleData *BundleMember = + ScheduleDataMap[IOP->OpInst][std::make_pair(nullptr, 0)]; + if (IOP->isNonAlt()) { + BundleMember = getScheduleData(IOP); + IsPseudo = true; + } + if (BundleMember->isPartOfBundle()) + return false; assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); + assert(!BundleMember->isPartOfBundle() && "Already part of another bundle"); if (BundleMember->IsScheduled) { // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the // existing schedule. @@ -3850,22 +4143,28 @@ Bundle = BundleMember; } BundleMember->UnscheduledDepsInBundle = 0; + BundleMember->Opcode = IOP->MainOpcode; + BundleMember->Parent = IOP->getKey().first; Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. BundleMember->FirstInBundle = Bundle; PrevInBundle = BundleMember; } - if (ScheduleEnd != OldScheduleEnd) { + if (ScheduleEnd != OldScheduleEnd || IsPseudo) { // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to // recalculate all dependencies. // It is seldom that this needs to be done a second time after adding the // initial bundle to the region. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - doForAllOpcodes(I, [](ScheduleData *SD) { - SD->clearDependencies(); - }); + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + auto *SD = It.second; + if (SD && isInSchedulingRegion(SD)) + SD->clearDependencies(); + } } ReSchedule = true; } @@ -3893,18 +4192,33 @@ } } if (!Bundle->isReady()) { - cancelScheduling(VL, S.OpValue); + cancelScheduling(IL[0]); + // We have to clear all dependencies, since all values + // were calculated for the vectorized bundle. + for (auto *I = ScheduleStart; I != ScheduleEnd; + I = I->getNextNode()) { + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + auto *SD = It.second; + if (SD && isInSchedulingRegion(SD)) + SD->clearDependencies(); + } + } + resetSchedule(); return false; } return true; } -void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, - Value *OpValue) { - if (isa(OpValue)) +void BoUpSLP::BlockScheduling::cancelScheduling(InstructionOrPseudo *IOP) { + if (!IOP->MainOp) return; - - ScheduleData *Bundle = getScheduleData(OpValue); + if (isa(IOP->MainOp)) + return; + auto Key = IOP->getKey(); + ScheduleData *Bundle = getScheduleData(IOP)->FirstInBundle; + assert(Bundle && "Could not find bundle"); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); @@ -3914,57 +4228,69 @@ // Un-bundle: make single instructions out of the bundle. 
ScheduleData *BundleMember = Bundle; while (BundleMember) { - assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); + assert(BundleMember->FirstInBundle == Bundle && "Corrupt bundle links"); + assert(BundleMember->Parent == Key.first && + BundleMember->Opcode == Key.second && "Corrupt bundle"); BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { - ReadyInsts.insert(BundleMember); + if (BundleMember->IsPseudo) { + ScheduleDataMap[BundleMember->Inst].erase(Key); + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + } else { + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + if (BundleMember->UnscheduledDepsInBundle == 0) { + ReadyInsts.insert(BundleMember); + } } BundleMember = Next; } } -BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { +BoUpSLP::ScheduleData * +BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { // Allocate a new ScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique(ChunkSize)); + ScheduleDataChunks.push_back( + llvm::make_unique(ChunkSize)); ChunkPos = 0; } return &(ScheduleDataChunks.back()[ChunkPos++]); } -bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, - const InstructionsState &S) { - if (getScheduleData(V, isOneOf(S, V))) +bool BoUpSLP::BlockScheduling::extendSchedulingRegion( + InstructionOrPseudo *IOP) { + if (getScheduleData(IOP)) return true; - Instruction *I = dyn_cast(V); - assert(I && "bundle member must be an instruction"); - assert(!isa(I) && "phi nodes don't need to be scheduled"); - auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { - ScheduleData *ISD = getScheduleData(I); - if (!ISD) + assert(!isa(IOP->OpInst) && "phi nodes don't need to be scheduled"); + auto &&CheckSheduleForI = [this, &IOP](Instruction *I) -> bool { + ScheduleData *ISD = ScheduleDataMap[I][std::make_pair(nullptr, 0)]; + if (!ISD || ISD->SchedulingRegionID != SchedulingRegionID) return false; assert(isInSchedulingRegion(ISD) && "ScheduleData not in scheduling region"); - ScheduleData *SD = allocateScheduleDataChunks(); - SD->Inst = I; - SD->init(SchedulingRegionID, S.OpValue); - ExtraScheduleDataMap[I][S.OpValue] = SD; + if (IOP->isNonAlt()) { + ScheduleData *SD = allocateScheduleDataChunks(); + SD->init(SchedulingRegionID, ISD, IOP->getKey().first, IOP->MainOpcode); + ScheduleDataMap[I][IOP->getKey()] = SD; + } return true; }; - if (CheckSheduleForI(I)) + if (CheckSheduleForI(IOP->OpInst)) return true; if (!ScheduleStart) { // It's the first instruction in the new region. 
- initScheduleData(I, I->getNextNode(), nullptr, nullptr); - ScheduleStart = I; - ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + initScheduleData(IOP->OpInst, IOP->OpInst->getNextNode(), nullptr, nullptr); + ScheduleStart = IOP->OpInst; + ScheduleEnd = IOP->OpInst->getNextNode(); + if (IOP->isNonAlt()) + CheckSheduleForI(IOP->OpInst); assert(ScheduleEnd && "tried to vectorize a terminator?"); - LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); + LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *IOP->Op + << "\n"); return true; } // Search up and down at the same time, because we don't know if the new @@ -3981,26 +4307,27 @@ } if (UpIter != UpperEnd) { - if (&*UpIter == I) { - initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); - ScheduleStart = I; - if (isOneOf(S, I) != I) - CheckSheduleForI(I); - LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I - << "\n"); + if (&*UpIter == IOP->OpInst) { + initScheduleData(IOP->OpInst, ScheduleStart, nullptr, + FirstLoadStoreInRegion); + ScheduleStart = IOP->OpInst; + if (IOP->isNonAlt()) + CheckSheduleForI(IOP->OpInst); + LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " + << *IOP->Op << "\n"); return true; } UpIter++; } if (DownIter != LowerEnd) { - if (&*DownIter == I) { - initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, - nullptr); - ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + if (&*DownIter == IOP->OpInst) { + initScheduleData(ScheduleEnd, IOP->OpInst->getNextNode(), + LastLoadStoreInRegion, nullptr); + ScheduleEnd = IOP->OpInst->getNextNode(); + if (IOP->isNonAlt()) + CheckSheduleForI(IOP->OpInst); assert(ScheduleEnd && "tried to vectorize a terminator?"); - LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I + LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *IOP->Op << "\n"); return true; } @@ -4012,21 +4339,20 @@ return true; } -void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, - Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore) { +void BoUpSLP::BlockScheduling::initScheduleData( + Instruction *FromI, Instruction *ToI, ScheduleData *PrevLoadStore, + ScheduleData *NextLoadStore) { ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - ScheduleData *SD = ScheduleDataMap[I]; + ScheduleData *SD = ScheduleDataMap[I][std::make_pair(nullptr, 0)]; if (!SD) { SD = allocateScheduleDataChunks(); - ScheduleDataMap[I] = SD; + ScheduleDataMap[I][std::make_pair(nullptr, 0)] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && "new ScheduleData already in scheduling region"); - SD->init(SchedulingRegionID, I); + SD->init(SchedulingRegionID); if (I->mayReadOrWriteMemory() && (!isa(I) || @@ -4061,8 +4387,13 @@ WorkList.pop_back(); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { assert(isInSchedulingRegion(BundleMember)); + assert(BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); + if (!BundleMember->hasValidDependencies()) { LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember @@ -4071,35 +4402,27 @@ BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. 
- if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. + for (User *U : BundleMember->Inst->users()) { + if (auto *I = dyn_cast(U)) { + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + ScheduleData *UseSD = It.second; + if (!UseSD || !isInSchedulingRegion(UseSD)) + continue; BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); } + } else { + // I'm not sure if this can ever happen. But we need to be safe. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. + BundleMember->Dependencies++; + BundleMember->incrementUnscheduledDeps(1); } } @@ -4108,7 +4431,7 @@ if (DepDest) { Instruction *SrcInst = BundleMember->Inst; MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); - bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + bool SrcMayWrite = SrcInst->mayWriteToMemory(); unsigned numAliased = 0; unsigned DistToSrc = 1; @@ -4132,14 +4455,23 @@ // balance between reduced runtime and accurate dependencies. numAliased++; - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); + // We don't want any duplicates in the set, in order to have + // correct dependencies. 
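+ // An instruction can now carry several ScheduleData entries (one per + // (Parent, Opcode) key), so the memory dependence is recorded on every + // entry of DepDest->Inst that lies in the current scheduling region.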
+ if (ScheduleDataMap.find(DepDest->Inst) != ScheduleDataMap.end()) { + for (auto It : ScheduleDataMap[DepDest->Inst]) { + auto *Dep = It.second; + if (!Dep || !isInSchedulingRegion(Dep)) + continue; + Dep->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = Dep->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); + } + } } } DepDest = DepDest->NextLoadStore; @@ -4177,22 +4509,102 @@ assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - doForAllOpcodes(I, [&](ScheduleData *SD) { - assert(isInSchedulingRegion(SD) && - "ScheduleData not in scheduling region"); + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + auto *SD = It.second; + if (!SD || !isInSchedulingRegion(SD)) + continue; SD->IsScheduled = false; SD->resetUnscheduledDeps(); - }); + } } ReadyInsts.clear(); } +void BoUpSLP::eliminatePseudos(BlockScheduling *BS) { + if (!BS->ScheduleStart) + return; + bool Changed = false; + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + SmallSet, 2> Keys; + SmallDenseMap NewOps; + for (auto It : BS->ScheduleDataMap[I]) { + auto *SD = It.second; + if (!SD || !BS->isInSchedulingRegion(SD) || !SD->IsPseudo) + continue; + auto Key = It.first; + // Remove any pseudo instruction that is + // left over from a canceled bundle. + if (!SD->isPartOfBundle()) { + Keys.insert(Key); + continue; + } + Changed = true; + // Emit a real instruction. + Builder.SetInsertPoint(I->getNextNode()); + Constant *C = + ConstantExpr::getBinOpIdentity(SD->Opcode, SD->Inst->getType(), true); + Value *V = Builder.CreateBinOp( + static_cast(SD->Opcode), I, C); + Instruction *NewOp = cast(V); + // Change instruction in ScheduleData. + SD->Inst = NewOp; + SD->IsPseudo = false; + // Remove the pseudo instruction from ScheduleDataMap + // and add the real instruction. + NewOps[NewOp] = SD; + Keys.insert(Key); + std::swap(ScalarToTreeEntry[NewOp][Key], ScalarToTreeEntry[I][Key]); + ScalarToTreeEntry[I].erase(Key); + TreeEntry *E = &VectorizableTree[ScalarToTreeEntry[NewOp][Key]]; + llvm::propagateIRFlags(V, E->Scalars[0]->MainOp); + V = propagateMetadata(NewOp, E->Scalars); + // Replace instruction in TreeEntry. + for (auto *IOP : E->Scalars) { + if (IOP->OpInst == I) { + IOP->Op = NewOp; + IOP->OpInst = NewOp; + IOP->PseudoOp = I; + } + } + } + for (auto Key : Keys) + BS->ScheduleDataMap[I].erase(Key); + for (auto Ops : NewOps) { + auto *SD = Ops.second; + BS->ScheduleDataMap[Ops.first][std::make_pair(SD->Parent, SD->Opcode)] = + SD; + } + } + // We have to recalculate all dependencies in + // the scheduling region if any pseudo instructions + // were replaced. 
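+ // The pseudo entries that belonged to a bundle now point at freshly created + // identity operations emitted right after the original scalar (e.g. for a + // lane '%y = mul i32 %a, %b' in an add-bundle a new 'add i32 %y, 0' is + // created), so dependencies computed against the old entries are stale.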
+ if (Changed) { + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + for (auto It : BS->ScheduleDataMap[I]) { + auto *SD = It.second; + if (SD && SD->SchedulingRegionID == BS->SchedulingRegionID) { + SD->clearDependencies(); + } + } + } + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); - BS->resetSchedule(); // For the real scheduling we use a more sophisticated ready-list: it is @@ -4211,16 +4623,24 @@ int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { - BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + for (auto It : BS->ScheduleDataMap[I]) { + auto *SD = It.second; + if (!SD || !BS->isInSchedulingRegion(SD)) + continue; + std::pair Key = std::make_pair(SD->Parent, SD->Opcode); assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (ScalarToTreeEntry.find(SD->Inst) != ScalarToTreeEntry.end() && + ScalarToTreeEntry[SD->Inst].find(Key) != + ScalarToTreeEntry[SD->Inst].end()) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { BS->calculateDependencies(SD, false, this); NumToSchedule++; } - }); + } } BS->initialFillReadyList(ReadyInsts); @@ -4234,20 +4654,39 @@ // Move the scheduled instruction(s) to their dedicated places, if not // there yet. ScheduleData *BundleMember = picked; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - Instruction *pickedInst = BundleMember->Inst; - if (LastScheduledInst->getNextNode() != pickedInst) { - BS->BB->getInstList().remove(pickedInst); + assert(Opcode == BundleMember->Opcode && + Parent == BundleMember->Parent && "Corrupt bundle member"); + Instruction *PickedInst = BundleMember->Inst; + if (LastScheduledInst != PickedInst) { + BS->BB->getInstList().remove(PickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), - pickedInst); + PickedInst); } - LastScheduledInst = pickedInst; + LastScheduledInst = PickedInst; BundleMember = BundleMember->NextInBundle; } - BS->schedule(picked, ReadyInsts); NumToSchedule--; } +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << *BS->BB << "\n"); + if (NumToSchedule != 0) { + for (BasicBlock::iterator I = BS->BB->begin(), E = BS->BB->end(); I != E; + ++I) { + if (BS->ScheduleDataMap.find(&*I) == BS->ScheduleDataMap.end()) + continue; + for (auto It : BS->ScheduleDataMap[&*I]) { + auto *SD = It.second; + if (SD && BS->isInSchedulingRegion(SD) && + SD->isSchedulingEntity() && SD->UnscheduledDepsInBundle != 0) + LLVM_DEBUG(dbgs() << "SLP: Failed to schedule: " << *SD << "\n"); + } + } + } +#endif assert(NumToSchedule == 0 && "could not schedule all instructions"); // Avoid duplicate scheduling of the block. @@ -4392,7 +4831,7 @@ // We only attempt to truncate integer expressions. auto &TreeRoot = VectorizableTree[0].Scalars; - auto *TreeRootIT = dyn_cast(TreeRoot[0]->getType()); + auto *TreeRootIT = dyn_cast(TreeRoot[0]->Op->getType()); if (!TreeRootIT) return; @@ -4402,7 +4841,9 @@ // This means that if a tree entry other than a root is used externally, it // must have multiple uses and InstCombine will not rewrite it. 
The code // below ensures that only the roots are used externally. - SmallPtrSet Expr(TreeRoot.begin(), TreeRoot.end()); + SmallPtrSet Expr; + for (auto *IOP : TreeRoot) + Expr.insert(IOP->Op); for (auto &EU : ExternalUses) if (!Expr.erase(EU.Scalar)) return; @@ -4413,12 +4854,13 @@ // context to determine which values can be demoted. If we see a truncation, // we mark it as seeding another demotion. for (auto &Entry : VectorizableTree) - Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end()); + for (auto *IOP: Entry.Scalars) + Expr.insert(IOP->Op); // Ensure the roots of the vectorizable tree don't form a cycle. They must // have a single external user that is not in the vectorizable tree. for (auto *Root : TreeRoot) - if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) + if (!Root->Op->hasOneUse() || Expr.count(*Root->Op->user_begin())) return; // Conservatively determine if we can actually truncate the roots of the @@ -4427,7 +4869,7 @@ SmallVector ToDemote; SmallVector Roots; for (auto *Root : TreeRoot) - if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) + if (!collectValuesToDemote(Root->Op, Expr, ToDemote, Roots)) return; // The maximum bit width required to represent all the values that can be @@ -4438,7 +4880,7 @@ // We first check if all the bits of the roots are demanded. If they're not, // we can truncate the roots to this narrower type. for (auto *Root : TreeRoot) { - auto Mask = DB->getDemandedBits(cast(Root)); + auto Mask = DB->getDemandedBits(Root->OpInst); MaxBitWidth = std::max( Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); } @@ -4457,17 +4899,17 @@ // We start by looking at each entry that can be demoted. We compute the // maximum bit width required to store the scalar by using ValueTracking to // compute the number of high-order bits we can truncate. - if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && - llvm::all_of(TreeRoot, [](Value *R) { - assert(R->hasOneUse() && "Root should have only one use!"); - return isa(R->user_back()); + if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->Op->getType()) && + llvm::all_of(TreeRoot, [](InstructionOrPseudo *R) { + assert(R->Op->hasOneUse() && "Root should have only one use!"); + return isa(R->Op->user_back()); })) { MaxBitWidth = 8u; // Determine if the sign bit of all the roots is known to be zero. If not, // IsKnownPositive is set to False. - IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) { - KnownBits Known = computeKnownBits(R, *DL); + IsKnownPositive = llvm::all_of(TreeRoot, [&](InstructionOrPseudo *R) { + KnownBits Known = computeKnownBits(R->Op, *DL); return Known.isNonNegative(); }); @@ -4865,9 +5307,21 @@ // Check that all of the parts are scalar instructions of the same type, // we permit an alternate opcode via InstructionsState. 
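+ // The InstructionOrPseudo wrappers built below are stack-local and are used + // only to query getSameOpcode(); any lane whose opcode is neither the main + // nor the alternate opcode of S makes the whole list non-vectorizable here.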
- InstructionsState S = getSameOpcode(VL); + SmallVector ArrayIL; + SmallVector IL; + + for (Value *V: VL) + ArrayIL.push_back(InstructionOrPseudo(VL[0], V)); + for (unsigned idx=0; idx < ArrayIL.size(); idx++) + IL.push_back(&ArrayIL[idx]); + InstructionsState S = getSameOpcode(IL); if (!S.getOpcode()) return false; + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (isOneOf(S, I) != I) + return false; + } Instruction *I0 = cast(S.OpValue); unsigned Sz = R.getVectorElementSize(I0); Index: test/Transforms/SLPVectorizer/X86/PR39774.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR39774.ll +++ test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -3,185 +3,93 @@ ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-8 -slp-min-tree-size=6 | FileCheck %s --check-prefixes=ALL,FORCE_REDUCTION define void @Test(i32) { -; CHECK-LABEL: @Test( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> , [[SHUFFLE]] -; CHECK-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef -; CHECK-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] -; CHECK-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] -; CHECK-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] -; CHECK-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] -; CHECK-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef -; CHECK-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] -; CHECK-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] -; CHECK-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] -; CHECK-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef -; CHECK-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] -; CHECK-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] -; CHECK-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] -; CHECK-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] -; CHECK-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] -; CHECK-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef -; CHECK-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], undef -; CHECK-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] -; CHECK-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] -; CHECK-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] -; CHECK-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] -; CHECK-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] -; CHECK-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] -; CHECK-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] -; CHECK-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] -; CHECK-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] -; CHECK-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] -; CHECK-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] -; CHECK-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] -; CHECK-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], undef -; CHECK-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] -; CHECK-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] -; CHECK-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] -; CHECK-NEXT: [[VAL_40:%.*]] = and i32 [[VAL_38]], undef -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP3]], 
[[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]] -; CHECK-NEXT: [[VAL_42:%.*]] = and i32 [[VAL_40]], undef -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[OP_EXTRA30]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 14910, i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 -; CHECK-NEXT: br label [[LOOP]] -; -; FORCE_REDUCTION-LABEL: @Test( -; FORCE_REDUCTION-NEXT: entry: -; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] -; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = 
shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> -; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> , [[SHUFFLE]] -; FORCE_REDUCTION-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef -; FORCE_REDUCTION-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] -; FORCE_REDUCTION-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef -; FORCE_REDUCTION-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef -; FORCE_REDUCTION-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef -; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 -; FORCE_REDUCTION-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], [[VAL_20]] -; FORCE_REDUCTION-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_34:%.*]] = add i32 [[TMP2]], 8555 -; FORCE_REDUCTION-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], [[VAL_34]] -; FORCE_REDUCTION-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> -; FORCE_REDUCTION-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP3]], [[RDX_SHUF]] -; FORCE_REDUCTION-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; FORCE_REDUCTION-NEXT: [[BIN_RDX2:%.*]] = and <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]] -; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA6:%.*]] = and i32 
[[OP_EXTRA5]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP2]] -; FORCE_REDUCTION-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 -; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA29]], [[VAL_39]] -; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_41]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1 -; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> -; FORCE_REDUCTION-NEXT: br label [[LOOP]] +; ALL-LABEL: @Test( +; ALL-NEXT: entry: +; ALL-NEXT: br label [[LOOP:%.*]] +; ALL: loop: +; ALL-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; ALL-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; ALL-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 +; ALL-NEXT: [[TMP3:%.*]] = add <8 x i32> , [[SHUFFLE]] +; ALL-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef +; ALL-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] +; ALL-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] +; ALL-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] +; ALL-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] +; ALL-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef +; ALL-NEXT: 
[[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] +; ALL-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] +; ALL-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] +; ALL-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef +; ALL-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] +; ALL-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] +; ALL-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] +; ALL-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] +; ALL-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] +; ALL-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef +; ALL-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], undef +; ALL-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] +; ALL-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] +; ALL-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] +; ALL-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] +; ALL-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] +; ALL-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] +; ALL-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] +; ALL-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] +; ALL-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] +; ALL-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] +; ALL-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] +; ALL-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] +; ALL-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], undef +; ALL-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] +; ALL-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] +; ALL-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] +; ALL-NEXT: [[VAL_40:%.*]] = and i32 [[VAL_38]], undef +; ALL-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 +; ALL-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP2]], i32 1 +; ALL-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; ALL-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP6]], i32 0 +; ALL-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 14910, i32 1 +; ALL-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP5]], [[TMP8]] +; ALL-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP5]], [[TMP8]] +; ALL-NEXT: [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> +; ALL-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <8 x i32> +; ALL-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP3]], [[RDX_SHUF]] +; ALL-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; ALL-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; ALL-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; ALL-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; ALL-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; ALL-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP12]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA15:%.*]] = and 
i32 [[OP_EXTRA14]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA31:%.*]] = and i32 [[OP_EXTRA30]], [[TMP2]] +; ALL-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 +; ALL-NEXT: br label [[LOOP]] ; entry: br label %loop Index: test/Transforms/SLPVectorizer/X86/PR40310.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR40310.ll +++ test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -7,7 +7,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 1 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 @@ -28,6 +28,13 @@ ; CHECK-NEXT: [[V38:%.*]] = and i32 undef, [[V36]] ; CHECK-NEXT: [[V40:%.*]] = and i32 undef, [[V38]] ; CHECK-NEXT: [[V42:%.*]] = and i32 undef, [[V40]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> , i32 [[V42]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = and <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = and <16 x i32> [[TMP4]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> @@ -36,12 +43,9 @@ ; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX6:%.*]] = and <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 -; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] -; CHECK-NEXT: [[V43:%.*]] = and i32 undef, [[V42]] -; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 -; 
CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 +; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP12]], [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void Index: test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/pr35497.ll +++ test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -12,20 +12,20 @@ define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 { ; CHECK-LABEL: @_ZN1C10SwitchModeEv( ; CHECK-NEXT: for.body.lr.ph.i: -; CHECK-NEXT: [[OR_1:%.*]] = or i64 undef, 1 -; CHECK-NEXT: store i64 [[OR_1]], i64* undef, align 8 +; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[BAR5]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: store i64 [[TMP2]], i64* undef, align 8 ; CHECK-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP1]], [[TMP4]] ; CHECK-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; for.body.lr.ph.i: Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -43,22 +43,16 @@ ; CHECK-LABEL: @add1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; 
CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -86,22 +80,16 @@ ; CHECK-LABEL: @sub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -205,22 +193,18 @@ ; CHECK-LABEL: @addsub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 
4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -248,22 +232,18 @@ ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -291,22 +271,16 @@ ; CHECK-LABEL: @mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; 
CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 -; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -334,22 +308,16 @@ ; CHECK-LABEL: @shl0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -453,22 +421,16 @@ ; CHECK-LABEL: @add1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: 
[[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -496,22 +458,16 @@ ; CHECK-LABEL: @sub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -615,22 +571,18 @@ ; CHECK-LABEL: @addsub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], 
align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -658,22 +610,18 @@ ; CHECK-LABEL: @addsub1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -701,22 +649,16 @@ ; CHECK-LABEL: @mulf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = 
load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 -; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -825,22 +767,16 @@ ; CHECK-LABEL: @sub0fn( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: