Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -299,12 +299,31 @@
     : TargetTransformInfo::SK_PermuteSingleSrc;
 }
 
+static bool isRemainder(unsigned Opcode) {
+  return (Opcode == Instruction::SRem || Opcode == Instruction::URem ||
+          Opcode == Instruction::FRem);
+}
+
+/// Checks if the \p Opcode can be considered as an operand of a (possibly)
+/// binary operation \p I.
+/// \returns The code of the binary operation of instruction \p I if the
+/// instruction with \p Opcode can be considered as an operand of \p I with the
+/// default value.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+  if (I->getOpcode() != Instruction::PHI && !isRemainder(I->getOpcode()) &&
+      (I->getType()->isIntegerTy() ||
+       (isa<FPMathOperator>(I) && cast<FPMathOperator>(I)->isFast())))
+    return I->getOpcode();
+  return 0;
+}
+
 namespace {
 
 /// Main data required for vectorization of instructions.
 struct InstructionsState {
   /// The very first instruction in the list with the main opcode.
   Value *OpValue = nullptr;
+  Value *Parent = nullptr;
 
   /// The main/alternate instruction.
   Instruction *MainOp = nullptr;
@@ -315,21 +334,28 @@
     return MainOp ? MainOp->getOpcode() : 0;
   }
 
+  std::pair<Value *, unsigned> getKey() const {
+    assert(Parent && "Incorrect parent!");
+    return std::make_pair(Parent, getOpcode());
+  }
+
   unsigned getAltOpcode() const {
     return AltOp ? AltOp->getOpcode() : 0;
   }
 
   /// Some of the instructions in the list have alternate opcodes.
-  bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+  bool isAltShuffle() const { return (getOpcode() != 0 && getAltOpcode() != 0 &&
+                                      getOpcode() != getAltOpcode()); }
 
   bool isOpcodeOrAlt(Instruction *I) const {
     unsigned CheckedOpcode = I->getOpcode();
     return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
   }
 
-  InstructionsState() = delete;
-  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
-      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
+  InstructionsState() = default;
+  InstructionsState(Value *OpValue, Value *Parent,
+                    Instruction *MainOp, Instruction *AltOp)
+      : OpValue(OpValue), Parent(Parent), MainOp(MainOp), AltOp(AltOp) {}
 };
 
 } // end anonymous namespace
@@ -337,58 +363,112 @@
 /// Chooses the correct key for scheduling data. If \p Op has the same (or
 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
 /// OpValue.
-static Value *isOneOf(const InstructionsState &S, Value *Op) {
-  auto *I = dyn_cast<Instruction>(Op);
+static Value *isOneOf(const InstructionsState &S, Instruction *I) {
   if (I && S.isOpcodeOrAlt(I))
-    return Op;
+    return I;
   return S.OpValue;
 }
 
 /// \returns analysis of the Instructions in \p VL described in
 /// InstructionsState, the Opcode that we suppose the whole list
 /// could be vectorized even if its structure is diverse.
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+static InstructionsState getSameOpcode(Value *Parent, ArrayRef<Value *> VL,
                                        unsigned BaseIndex = 0) {
+  assert(Parent && "Incorrect parent!");
   // Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState(VL[BaseIndex], Parent, nullptr, nullptr); + unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); bool IsCastOp = isa(VL[BaseIndex]); bool IsBinOp = isa(VL[BaseIndex]); - unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); + bool IsNonAlt = false; unsigned AltOpcode = Opcode; + unsigned OpcodeNum = 0; + unsigned AltOpcodeNum = 0; + unsigned NonAltNum = 0; + unsigned NonAltIndex = 0; unsigned AltIndex = BaseIndex; - // Check for one alternate opcode from another BinaryOperator. - // TODO - generalize to support all operators (types, calls etc.). + // Check for an alternate opcode pattern. for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); - if (IsBinOp && isa(VL[Cnt])) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) - continue; - if (Opcode == AltOpcode) { - AltOpcode = InstOpcode; - AltIndex = Cnt; - continue; - } - } else if (IsCastOp && isa(VL[Cnt])) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (IsCastOp && isa(VL[Cnt])) { Type *Ty0 = cast(VL[BaseIndex])->getOperand(0)->getType(); Type *Ty1 = cast(VL[Cnt])->getOperand(0)->getType(); if (Ty0 == Ty1) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) + if (InstOpcode == Opcode) { + OpcodeNum++; + continue; + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; continue; + } if (Opcode == AltOpcode) { AltOpcode = InstOpcode; AltIndex = Cnt; + AltOpcodeNum++; continue; } } - } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) + return InstructionsState(VL[BaseIndex], Parent, nullptr, nullptr); + } + if (InstOpcode == Opcode) { + OpcodeNum++; continue; - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; + continue; + } + if (InstOpcode != Opcode && InstOpcode != AltOpcode) { + if (IsBinOp && AltOpcode == Opcode && isa(I)) { + AltOpcode = InstOpcode; + AltOpcodeNum++; + AltIndex = Cnt; + continue; + } + if (Opcode != Instruction::PHI && + (tryToRepresentAsInstArg(Opcode, I) || + (IsBinOp && InstOpcode != Instruction::PHI && + tryToRepresentAsInstArg(InstOpcode, + cast(VL[BaseIndex]))))) { + if (!IsNonAlt) { + NonAltIndex = Cnt; + IsNonAlt = true; + } + NonAltNum++; + continue; + } + return InstructionsState(VL[BaseIndex], Parent, nullptr, nullptr); + } } - return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), + if (IsNonAlt && VL.size() > 2 && (OpcodeNum + AltOpcodeNum) <= NonAltNum) { + BaseIndex = NonAltIndex; + AltIndex = BaseIndex; + Opcode = cast(VL[BaseIndex])->getOpcode(); + AltOpcode = Opcode; + IsBinOp = isa(VL[BaseIndex]); + for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (Opcode == AltOpcode && IsBinOp && isa(I)) { + AltOpcode = InstOpcode; + AltIndex = Cnt; + } + } + } + + if (IsNonAlt && (!IsBinOp || + isRemainder(Opcode) || + isRemainder(AltOpcode))) + return InstructionsState(VL[BaseIndex], Parent, nullptr, nullptr); + + return InstructionsState(VL[BaseIndex], Parent, + cast(VL[BaseIndex]), cast(VL[AltIndex])); } @@ -613,7 +693,8 @@ int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. 
- void buildTree_rec(ArrayRef Roots, unsigned Depth, int); + void buildTree_rec(Value *Parent, ArrayRef Roots, unsigned Depth, + int); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a @@ -627,7 +708,7 @@ Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef VL); + Value *vectorizeTree(ArrayRef VL, Value *Parent); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -701,10 +782,14 @@ /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + + /// Info about instruction in this tree entry. + InstructionsState State; }; /// Create a new VectorizableTree entry. void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + const InstructionsState &S, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); @@ -716,11 +801,22 @@ ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; if (Vectorized) { - for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + Last->State = S; + for (Value *V: VL) { + auto *I = cast(V); + assert(!getTreeEntry(I, S.getKey()) && "Scalar already in tree!"); + ScalarToTreeEntry[I][S.getKey()] = idx; } } else { + for (Value *V: VL) { + if (Instruction *I = dyn_cast(V)) { + Last->State.MainOp = I; + Last->State.AltOp = I; + break; + } + } + Last->State.OpValue = VL[0]; + Last->State.Parent = VL[0]; MustGather.insert(VL.begin(), VL.end()); } @@ -733,15 +829,36 @@ /// Holds all of the tree entries. std::vector VectorizableTree; - TreeEntry *getTreeEntry(Value *V) { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + TreeEntry *getTreeEntry(Instruction *I) { + if (!I) + return nullptr; + auto It = ScalarToTreeEntry.find(I); + if (It != ScalarToTreeEntry.end()) { + auto &STT = It->second; + for (auto STTI : STT) { + if (isOneOf(VectorizableTree[STTI.second].State, I) == I) + return &VectorizableTree[STTI.second]; + } + } + return nullptr; + } + + TreeEntry *getTreeEntry(Instruction *I, std::pair Key) { + if (!I) + return nullptr; + auto It = ScalarToTreeEntry.find(I); + if (It != ScalarToTreeEntry.end()) { + auto &STT = It->second; + auto STTI = STT.find(Key); + if (STTI != STT.end()) + return &VectorizableTree[STTI->second]; + } return nullptr; } /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; + SmallDenseMap, int>> + ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -831,19 +948,6 @@ // dependencies are not calculated yet. enum { InvalidDeps = -1 }; - ScheduleData() = default; - - void init(int BlockSchedulingRegionID, Value *OpVal) { - FirstInBundle = this; - NextInBundle = nullptr; - NextLoadStore = nullptr; - IsScheduled = false; - SchedulingRegionID = BlockSchedulingRegionID; - UnscheduledDepsInBundle = UnscheduledDeps; - clearDependencies(); - OpValue = OpVal; - } - /// Returns true if the dependency information has been calculated. 
bool hasValidDependencies() const { return Dependencies != InvalidDeps; } @@ -885,24 +989,39 @@ MemoryDependencies.clear(); } + /// Get an instruction behind this ScheduleData instance. + virtual Instruction *getInst() const = 0; + + /// Returns true if the instance is a pseudo instruction one. + virtual bool isPseudo() const = 0; + void dump(raw_ostream &os) const { if (!isSchedulingEntity()) { - os << "/ " << *Inst; + os << "/ "; + if (isPseudo()) + os << "*"; + os << *getInst(); } else if (NextInBundle) { - os << '[' << *Inst; + os << '['; + if (isPseudo()) + os << "*"; + os << *getInst(); ScheduleData *SD = NextInBundle; while (SD) { - os << ';' << *SD->Inst; - SD = SD->NextInBundle; + os << ';' ; + if (SD->isPseudo()) + os << "*"; + os << *SD->getInst(); + SD = SD->NextInBundle; } os << ']'; } else { - os << *Inst; + if (isPseudo()) + os << "*"; + os << *getInst(); } } - Instruction *Inst = nullptr; - /// Points to the head in an instruction bundle (and always to this for /// single instructions). ScheduleData *FirstInBundle = nullptr; @@ -946,8 +1065,66 @@ /// dry-run). bool IsScheduled = false; - /// Opcode of the current instruction in the schedule data. - Value *OpValue = nullptr; + /// Opcode that represents instructions to be vectorized. + unsigned Opcode = 0; + + Value *Parent = nullptr; + }; + + struct InstScheduleData : public ScheduleData { + + InstScheduleData() = default; + + Instruction *Inst = nullptr; + + void init(int BlockSchedulingRegionID) { + FirstInBundle = this; + NextInBundle = nullptr; + NextLoadStore = nullptr; + IsScheduled = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + } + + Instruction *getInst() const { + return Inst; + } + + bool isPseudo() const { + return false; + } + + }; + + struct PseudoScheduleData : public ScheduleData { + + PseudoScheduleData() = default; + + InstScheduleData *ISD; + + void init(int BlockSchedulingRegionID, InstScheduleData *OpISD, + Value *OpParent, unsigned OpCode) { + FirstInBundle = this; + NextInBundle = nullptr; + NextLoadStore = OpISD->NextLoadStore; + IsScheduled = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + ISD = OpISD; + Opcode = OpCode; + Parent = OpParent; + } + + Instruction *getInst() const { + return ISD->Inst; + } + + bool isPseudo() const { + return true; + } + }; #ifndef NDEBUG @@ -964,7 +1141,8 @@ /// Contains all scheduling data for a basic block. struct BlockScheduling { BlockScheduling(BasicBlock *BB) - : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} + : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), + PseudoChunkSize(BB->size()), PseudoChunkPos(PseudoChunkSize) {} void clear() { ReadyInsts.clear(); @@ -972,6 +1150,7 @@ ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + PseudoInstScheduleDataMap.clear(); // Reduce the maximum schedule region size by the size of the // previous scheduling run. 
@@ -985,21 +1164,24 @@ ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; + InstScheduleData *getInstScheduleData(Instruction *I) { + InstScheduleData *SD = InstScheduleDataMap[I]; if (SD && SD->SchedulingRegionID == SchedulingRegionID) return SD; return nullptr; } - ScheduleData *getScheduleData(Value *V, Value *Key) { - if (V == Key) - return getScheduleData(V); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) - return SD; + ScheduleData *getScheduleData(Instruction *I, + std::pair Key) { + ScheduleData *SD = getInstScheduleData(I); + if (SD && SD->Parent == Key.first && SD->Opcode == Key.second) + return SD; + auto It = PseudoInstScheduleDataMap.find(I); + if (It != PseudoInstScheduleDataMap.end()) { + PseudoScheduleData *PSD = It->second[Key]; + if (PSD && PSD->SchedulingRegionID == SchedulingRegionID && + PSD->Parent == Key.first && PSD->Opcode == Key.second) + return PSD; } return nullptr; } @@ -1016,13 +1198,13 @@ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; - continue; - } + assert(BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); // Handle the def-use chain dependencies. - for (Use &U : BundleMember->Inst->operands()) { + for (Use &U : BundleMember->getInst()->operands()) { auto *I = dyn_cast(U.get()); if (!I) continue; @@ -1058,15 +1240,21 @@ } } - void doForAllOpcodes(Value *V, + void doForAllOpcodes(Instruction *I, function_ref Action) { - if (ScheduleData *SD = getScheduleData(V)) + auto It = PseudoInstScheduleDataMap.find(I); + if (It != PseudoInstScheduleDataMap.end()) { + for (auto &P : It->second) { + ScheduleData *SD = P.second; + if (SD && SD->isPartOfBundle() && + SD->SchedulingRegionID == SchedulingRegionID) { + Action(SD); + } + } + } + if (ScheduleData *SD = getInstScheduleData(I)) { Action(SD); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) - for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) - Action(P.second); + } } /// Put all instructions into the ReadyList which are ready for scheduling. @@ -1090,20 +1278,22 @@ const InstructionsState &S); /// Un-bundles a group of instructions. - void cancelScheduling(ArrayRef VL, Value *OpValue); + void cancelScheduling(Value *OpValue, std::pair Key); /// Allocates schedule data chunk. - ScheduleData *allocateScheduleDataChunks(); + InstScheduleData *allocateInstScheduleDataChunks(); - /// Extends the scheduling region so that V is inside the region. + PseudoScheduleData *allocatePseudoInstDataChunks(); + + /// Extends the scheduling region so that I is inside the region. /// \returns true if the region size is within the limit. - bool extendSchedulingRegion(Value *V, const InstructionsState &S); + bool extendSchedulingRegion(Instruction *I, const InstructionsState &S); - /// Initialize the ScheduleData structures for new instructions in the + /// Initialize the InstScheduleData structures for new instructions in the /// scheduling region. 
void initScheduleData(Instruction *FromI, Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore); + InstScheduleData *PrevLoadStore, + InstScheduleData *NextLoadStore); /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. @@ -1113,26 +1303,39 @@ /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); + /// Reorder bundles from PseudoScheduleData data after scheduling, + /// if an Instruction is present in PseudoScheduleData that means this + /// Instruction is prenet in multiply bundles and FirstInBundle is not last + /// one scheduled for all copies of instuction in InstScheduleData and + /// PseudoScheduleData. + void reorderBundles(); + BasicBlock *BB; - /// Simple memory allocation for ScheduleData. - std::vector> ScheduleDataChunks; + /// Simple memory allocation for InstScheduleData. + std::vector> InstScheduleDataChunks; + + std::vector> PseudoScheduleDataChunks; - /// The size of a ScheduleData array in ScheduleDataChunks. + /// The size of a InstScheduleData array in InstScheduleDataChunks. int ChunkSize; /// The allocator position in the current chunk, which is the last entry - /// of ScheduleDataChunks. + /// of InstScheduleDataChunks. int ChunkPos; - /// Attaches ScheduleData to Instruction. + int PseudoChunkSize; + + int PseudoChunkPos; + + /// Attaches InstScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. - /// ScheduleData structures are recycled. - DenseMap ScheduleDataMap; + /// InstScheduleData structures are recycled. + DenseMap InstScheduleDataMap; - /// Attaches ScheduleData to Instruction with the leading key. - DenseMap> - ExtraScheduleDataMap; + DenseMap, PseudoScheduleData *>> + PseudoInstScheduleDataMap; struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } @@ -1149,11 +1352,11 @@ /// The first memory accessing instruction in the scheduling region /// (can be null). - ScheduleData *FirstLoadStoreInRegion = nullptr; + InstScheduleData *FirstLoadStoreInRegion = nullptr; /// The last memory accessing instruction in the scheduling region /// (can be null). - ScheduleData *LastLoadStoreInRegion = nullptr; + InstScheduleData *LastLoadStoreInRegion = nullptr; /// The current size of the scheduling region. int ScheduleRegionSize = 0; @@ -1162,9 +1365,9 @@ int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; /// The ID of the scheduling region. For a new vectorization iteration this - /// is incremented which "removes" all ScheduleData from the region. + /// is incremented which "removes" all InstScheduleData from the region. // Make sure that the initial SchedulingRegionID is greater than the - // initial SchedulingRegionID in ScheduleData (which is 0). + // initial SchedulingRegionID in InstScheduleData (which is 0). int SchedulingRegionID = 1; }; @@ -1331,7 +1534,7 @@ UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots[0], Roots, 0, -1); // Collect the values that we need to extract from the tree. 
for (TreeEntry &EIdx : VectorizableTree) { @@ -1345,6 +1548,8 @@ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; int FoundLane = Lane; + if (!Entry->State.isOpcodeOrAlt(cast(Scalar))) + continue; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), @@ -1366,7 +1571,7 @@ continue; // Skip in-tree scalars that become vectors - if (TreeEntry *UseEntry = getTreeEntry(U)) { + if (TreeEntry *UseEntry = getTreeEntry(cast(U))) { Value *UseScalar = UseEntry->Scalars[0]; // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will @@ -1392,35 +1597,63 @@ } } -void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, +static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) { + switch(Opcode) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return ConstantInt::getNullValue(Ty); + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + return ConstantInt::get(Ty, /*V=*/1); + case Instruction::FAdd: + case Instruction::FSub: + return ConstantFP::get(Ty, /*V=*/0.0); + case Instruction::FMul: + case Instruction::FDiv: + return ConstantFP::get(Ty, /*V=*/1.0); + case Instruction::And: + return ConstantInt::getAllOnesValue(Ty); + default: + break; + } + llvm_unreachable("unknown binop for default constant value"); +} + +void BoUpSLP::buildTree_rec(Value *Parent, ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); - InstructionsState S = getSameOpcode(VL); + InstructionsState S = getSameOpcode(Parent, VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1432,17 +1665,17 @@ if (EphValues.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } // Check if this is a duplicate of another entry. - if (TreeEntry *E = getTreeEntry(S.OpValue)) { + if (TreeEntry *E = getTreeEntry(cast(S.OpValue))) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Record the reuse of the tree node. 
FIXME, currently this is only used to @@ -1455,13 +1688,11 @@ // Check that none of the instructions in the bundle are already in the tree. for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast(VL[i]); - if (!I) - continue; - if (getTreeEntry(I)) { + auto *I = cast(VL[i]); + if (getTreeEntry(I) || getTreeEntry(I, S.getKey())) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1471,7 +1702,7 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1485,7 +1716,7 @@ // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1505,7 +1736,7 @@ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } VL = UniqueValues; @@ -1519,10 +1750,10 @@ if (!BS.tryScheduleBundle(VL, this, S)) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL0) || - !BS.getScheduleData(VL0)->isPartOfBundle()) && + assert((!BS.getScheduleData(VL0, S.getKey()) || + !BS.getScheduleData(VL0, S.getKey())->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1542,13 +1773,13 @@ if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1558,7 +1789,7 @@ Operands.push_back(cast(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1569,7 +1800,7 @@ if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies); return; } @@ -1586,13 +1817,15 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - 
newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); - BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, S, + ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); return; } case Instruction::Load: { @@ -1606,8 +1839,8 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1619,8 +1852,8 @@ for (Value *V : VL) { auto *L = cast(V); if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1650,14 +1883,14 @@ if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1666,8 +1899,8 @@ } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1686,14 +1919,14 @@ for (unsigned i = 0; i < VL.size(); ++i) { Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1702,7 +1935,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1715,15 +1948,15 @@ CmpInst *Cmp = cast(VL[i]); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1732,7 
+1965,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1755,7 +1988,7 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1763,18 +1996,26 @@ if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (I->getOpcode() == S.getOpcode()) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Operands.push_back(VecOp); + } + if (allSameType(Operands)) + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; @@ -1783,8 +2024,8 @@ for (unsigned j = 0; j < VL.size(); ++j) { if (cast(VL[j])->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } @@ -1797,8 +2038,8 @@ if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } @@ -1809,13 +2050,13 @@ if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1823,7 +2064,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1831,20 +2072,20 @@ // Check if the stores are consecutive or of we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast(j)->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { @@ -1854,8 +2095,8 @@ // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1868,8 +2109,8 @@ if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1879,8 +2120,8 @@ if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1891,23 +2132,32 @@ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); - Operands.push_back(CI2->getArgOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (S.isOpcodeOrAlt(I)) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = getDefaultConstantForOpcode(S.getOpcode(), + I->getType()); + Operands.push_back(Operand); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + if (allSameType(Operands)) + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1915,36 +2165,45 @@ // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. 
if (!S.isAltShuffle()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (S.isOpcodeOrAlt(I)) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = getDefaultConstantForOpcode(S.getOpcode(), I->getType()); + Operands.push_back(Operand); + } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; default: - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1980,7 +2239,7 @@ Instruction *E0 = cast(OpValue); assert(E0->getOpcode() == Instruction::ExtractElement || E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + assert(E0->getOpcode() == getSameOpcode(VL[0], VL).getOpcode() && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -2045,7 +2304,7 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { return I->hasOneUse() || std::all_of(I->user_begin(), I->user_end(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + return ScalarToTreeEntry.count(dyn_cast(U)) > 0; }); } @@ -2079,7 +2338,7 @@ return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && + if (getSameOpcode(VL[0], VL).getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { Optional ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { @@ -2089,10 +2348,11 @@ // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. 
- if (areAllUsersVectorized(cast(V)) && - !ScalarToTreeEntry.count(V)) { + auto *I = cast(V); + if (areAllUsersVectorized(I) && + !ScalarToTreeEntry.count(I)) { auto *IO = cast( - cast(V)->getIndexOperand()); + cast(I)->getIndexOperand()); Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, IO->getZExtValue()); } @@ -2102,11 +2362,11 @@ } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->State.getOpcode() && allSameType(VL) && allSameBlock(VL) && + "Invalid VL"); + auto *VL0 = cast(E->State.OpValue); + unsigned ShuffleOrOp = E->State.isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : E->State.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2192,7 +2452,7 @@ case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->State.getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2205,7 +2465,8 @@ // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->State.getOpcode(), VecTy, + SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2213,14 +2474,16 @@ case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->State.getOpcode(), + ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->State.getOpcode(), VecTy, + MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: @@ -2246,7 +2509,7 @@ TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = @@ -2257,35 +2520,40 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. 
- ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast(VL[i]); - ConstantInt *CInt = dyn_cast(I->getOperand(1)); - if (!CInt) { - Op2VK = TargetTransformInfo::OK_AnyValue; - Op2VP = TargetTransformInfo::OP_None; - break; - } - if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; + if (auto *CInt = dyn_cast(VL0->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + const unsigned Opcode = E->State.getOpcode(); + for (auto *V : VL) { + auto *I = cast(V); + if (I == VL0 || Opcode != I->getOpcode()) + continue; + if (!isa(I->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + Op2VP = TargetTransformInfo::OP_None; + break; + } + ConstantInt *CInt_cur = cast(I->getOperand(1)); + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + if (Op2VP == TargetTransformInfo::OP_PowerOf2 && + !CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_None; + if (CInt != CInt_cur) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } - if (CInt0 != CInt) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->State.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost(E->State.getOpcode(), VecTy, + Op1VK, Op2VK, Op1VP, Op2VP, + Operands); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2366,11 +2634,11 @@ return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->State.isAltShuffle() && + ((Instruction::isBinaryOp(E->State.getOpcode()) && + Instruction::isBinaryOp(E->State.getAltOpcode())) || + (Instruction::isCast(E->State.getOpcode()) && + Instruction::isCast(E->State.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -2387,23 +2655,23 @@ } for (Value *i : VL) { Instruction *I = cast(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->State.getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->State.getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->State.getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->State.MainOp->getOperand(0)->getType(); + Type *Src1SclTy = E->State.AltOp->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->State.getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->State.getAltOpcode(), VecTy, + Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -2469,7 +2737,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = dyn_cast(N.State.OpValue); if (!Inst) continue; @@ -2481,8 +2749,9 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) - LiveValues.insert(cast(&*J)); + auto *I = dyn_cast(&*J); + if (I && getTreeEntry(I)) + LiveValues.insert(I); } LLVM_DEBUG({ @@ -2654,9 +2923,13 @@ // Push left and right operands of binary operation into Left and Right for (Value *V : VL) { auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); + if (S.isOpcodeOrAlt(I)) { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } else { + Left.push_back(I); + Right.push_back(getDefaultConstantForOpcode(S.getOpcode(), I->getType())); + } } // Reorder if we have a commutative operation and consecutive access @@ -2705,8 +2978,13 @@ int i, unsigned Opcode, Instruction &I, ArrayRef Left, ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); + if (I.getOpcode() == Opcode) { + VLeft = I.getOperand(0); + VRight = I.getOperand(1); + } else { + VLeft = &I; + VRight = getDefaultConstantForOpcode(Opcode, I.getType()); + } // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { if (VRight == Right[i - 1]) @@ -2770,8 +3048,15 @@ // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. auto *I = cast(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); + Value *VLeft; + Value *VRight; + if (I->getOpcode() == Opcode) { + VLeft = I->getOperand(0); + VRight = I->getOperand(1); + } else { + VLeft = I; + VRight = getDefaultConstantForOpcode(Opcode, I->getType()); + } if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); @@ -2869,17 +3154,15 @@ // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; - // Find the last instruction. 
The common case should be that BB has been - // scheduled, and the last instruction is VL.back(). So we start with - // VL.back() and iterate over schedule data until we reach the end of the - // bundle. The end of the bundle is marked by null ScheduleData. + // Find the last instruction. If the bundle is not scheduled then + // the first in the bundle is the last one in BB, because we discover + // bundles in backward walk. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlockScheduling *BS = BlocksSchedules[BB].get(); + auto *Bundle = BS->getScheduleData(cast(S.OpValue), + S.getKey()); if (Bundle && Bundle->isPartOfBundle()) - for (; Bundle; Bundle = Bundle->NextInBundle) - if (Bundle->OpValue == Bundle->Inst) - LastInst = Bundle->Inst; + LastInst = Bundle->FirstInBundle->getInst(); } // LastInst can still be null at this point if there's either not an entry @@ -2926,7 +3209,7 @@ CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. - if (TreeEntry *E = getTreeEntry(VL[i])) { + if (TreeEntry *E = getTreeEntry(dyn_cast(VL[i]))) { // Find which lane we need to extract. int FoundLane = -1; for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) { @@ -2950,10 +3233,11 @@ return Vec; } -Value *BoUpSLP::vectorizeTree(ArrayRef VL) { - InstructionsState S = getSameOpcode(VL); +Value *BoUpSLP::vectorizeTree(ArrayRef VL, Value *Parent) { + InstructionsState S = getSameOpcode(Parent, VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { + TreeEntry *E = getTreeEntry(dyn_cast(S.OpValue)); + if (E && E->State.getOpcode() == S.getOpcode()) { if (E->isSame(VL)) { Value *V = vectorizeTree(E); if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { @@ -3026,12 +3310,12 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " + << *E->State.OpValue << ".\n"); return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(S.OpValue); + auto *VL0 = cast(E->State.OpValue); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3040,7 +3324,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3054,8 +3338,8 @@ return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = E->State.isAltShuffle() ? 
+ (unsigned) Instruction::ShuffleVector : E->State.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3088,7 +3372,7 @@ Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(Operands, E->State.OpValue); NewPhi->addIncoming(Vec, IBB); } @@ -3117,7 +3401,7 @@ E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3152,7 +3436,7 @@ E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3181,9 +3465,9 @@ for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(INVL, E->State.OpValue); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3208,10 +3492,10 @@ RHSV.push_back(cast(V)->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(LHSV, E->State.OpValue); + Value *R = vectorizeTree(RHSV, E->State.OpValue); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3220,7 +3504,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->State.getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3242,11 +3526,11 @@ FalseVec.push_back(cast(V)->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Cond = vectorizeTree(CondVec, E->State.OpValue); + Value *True = vectorizeTree(TrueVec, E->State.OpValue); + Value *False = vectorizeTree(FalseVec, E->State.OpValue); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3282,19 +3566,26 @@ case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); + reorderInputsAccordingToOpcode(E->State.getOpcode(), E->Scalars, + LHSVL, RHSVL); else for (Value *V : E->Scalars) { auto *I = cast(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); + if (I->getOpcode() == E->State.getOpcode()) { + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); + } else { + LHSVL.push_back(V); + RHSVL.push_back( + getDefaultConstantForOpcode(E->State.getOpcode(), + I->getType())); + } } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(LHSVL, E->State.OpValue); + Value *RHS = vectorizeTree(RHSVL, E->State.OpValue); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " 
<< *VL0 << ".\n"); @@ -3302,7 +3593,7 @@ } Value *V = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(VL0->getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3321,10 +3612,12 @@ // sink them all the way down past store instructions. bool IsReorder = !E->ReorderIndices.empty(); if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); + InstructionsState S = getSameOpcode(E->State.OpValue, E->Scalars, + E->ReorderIndices.front()); VL0 = cast(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, S); + } else + setInsertPointAfterBundle(E->Scalars, E->State); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3337,7 +3630,7 @@ // ExternalUses list to make sure that an extract will be generated in the // future. Value *PO = LI->getPointerOperand(); - if (getTreeEntry(PO)) + if (getTreeEntry(dyn_cast(PO))) ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); @@ -3371,9 +3664,9 @@ for (Value *V : E->Scalars) ScalarStoreValues.push_back(cast(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(ScalarStoreValues, E->State.OpValue); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3381,7 +3674,7 @@ // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the // future. - if (getTreeEntry(ScalarPtr)) + if (getTreeEntry(dyn_cast(ScalarPtr))) ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); if (!Alignment) @@ -3398,13 +3691,13 @@ return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); ValueList Op0VL; for (Value *V : E->Scalars) Op0VL.push_back(cast(V)->getOperand(0)); - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(Op0VL, E->State.OpValue); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; @@ -3413,7 +3706,7 @@ for (Value *V : E->Scalars) OpVL.push_back(cast(V)->getOperand(j)); - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(OpVL, E->State.OpValue); OpVecs.push_back(OpVec); } @@ -3433,7 +3726,7 @@ } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -3456,7 +3749,7 @@ OpVL.push_back(CEI->getArgOperand(j)); } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(OpVL, E->State.OpValue); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3472,7 +3765,7 @@ // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. 
- if (ScalarArg && getTreeEntry(ScalarArg)) + if (ScalarArg && getTreeEntry(dyn_cast(ScalarArg))) ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); propagateIRFlags(V, E->Scalars, VL0); @@ -3486,25 +3779,25 @@ } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->State.isAltShuffle() && + ((Instruction::isBinaryOp(E->State.getOpcode()) && + Instruction::isBinaryOp(E->State.getAltOpcode())) || + (Instruction::isCast(E->State.getOpcode()) && + Instruction::isCast(E->State.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + if (Instruction::isBinaryOp(E->State.getOpcode())) { + reorderAltShuffleOperands(E->State, E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars, E->State); + LHS = vectorizeTree(LHSVL, E->State.OpValue); + RHS = vectorizeTree(RHSVL, E->State.OpValue); } else { ValueList INVL; for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + setInsertPointAfterBundle(E->Scalars, E->State); + LHS = vectorizeTree(INVL, E->State.OpValue); } if (E->VectorizedValue) { @@ -3513,16 +3806,20 @@ } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->State.getOpcode())) { V0 = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->State.getOpcode()), LHS, + RHS); V1 = Builder.CreateBinOp( - static_cast(S.getAltOpcode()), LHS, RHS); + static_cast(E->State.getAltOpcode()), LHS, + RHS); } else { V0 = Builder.CreateCast( - static_cast(S.getOpcode()), LHS, VecTy); + static_cast(E->State.getOpcode()), LHS, + VecTy); V1 = Builder.CreateCast( - static_cast(S.getAltOpcode()), LHS, VecTy); + static_cast(E->State.getAltOpcode()), LHS, + VecTy); } // Create shuffle to take alternate operations from the vector. @@ -3533,8 +3830,7 @@ SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + if (OpInst->getOpcode() == E->State.getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -3544,8 +3840,10 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, OpScalars); - propagateIRFlags(V1, AltScalars); + InstructionsState S = getSameOpcode(E->State.OpValue, OpScalars); + propagateIRFlags(V0, OpScalars, S.OpValue); + S = getSameOpcode(E->State.OpValue, AltScalars); + propagateIRFlags(V1, AltScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) @@ -3583,7 +3881,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. 
-  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+  auto *ScalarRoot = VectorizableTree[0].State.OpValue;
   if (MinBWs.count(ScalarRoot)) {
     if (auto *I = dyn_cast(VectorRoot))
       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
@@ -3616,7 +3914,7 @@
     // has multiple uses of the same value.
     if (User && !is_contained(Scalar->users(), User))
       continue;
-    TreeEntry *E = getTreeEntry(Scalar);
+    TreeEntry *E = getTreeEntry(dyn_cast(Scalar));
     assert(E && "Invalid scalar");
     assert(!E->NeedToGather && "Extracting from a gather list");
@@ -3698,6 +3996,9 @@
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
+      if (!Entry->State.isOpcodeOrAlt(cast(Scalar)))
+        continue;
+
       Type *Ty = Scalar->getType();
       if (!Ty->isVoidTy()) {
 #ifndef NDEBUG
@@ -3705,7 +4006,8 @@
           LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
           // It is legal to replace users in the ignorelist by undef.
-          assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
+          assert((getTreeEntry(dyn_cast(U)) ||
+                  is_contained(UserIgnoreList, U)) &&
                  "Replacing out-of-tree value with undef");
         }
 #endif
@@ -3823,14 +4125,22 @@
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
   for (Value *V : VL) {
-    if (!extendSchedulingRegion(V, S))
+    auto *I = dyn_cast(V);
+    assert(I && "bundle member must be an instruction");
+    if (!extendSchedulingRegion(I, S))
       return false;
   }
   for (Value *V : VL) {
-    ScheduleData *BundleMember = getScheduleData(V);
+    auto *I = cast(V);
+    ScheduleData *BundleMember = getInstScheduleData(I);
+    if (BundleMember->isPartOfBundle())
+      BundleMember = getScheduleData(I, S.getKey());
+    if (BundleMember->isPartOfBundle())
+      return false;
     assert(BundleMember &&
            "no ScheduleData for bundle member (maybe not in same basic block)");
+    assert(!BundleMember->isPartOfBundle() && "Already part of another bundle");
     if (BundleMember->IsScheduled) {
       // A bundle member was scheduled as single instruction before and now
       // needs to be scheduled as part of the bundle. We just get rid of the
@@ -3847,6 +4157,8 @@
       Bundle = BundleMember;
     }
     BundleMember->UnscheduledDepsInBundle = 0;
+    BundleMember->Opcode = S.getOpcode();
+    BundleMember->Parent = S.Parent;
     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
     // Group the instructions to a bundle.
@@ -3890,18 +4202,30 @@
     }
   }
   if (!Bundle->isReady()) {
-    cancelScheduling(VL, S.OpValue);
+    cancelScheduling(S.OpValue, S.getKey());
+    // We have to clear all dependencies, since all values
+    // were calculated for the vectorized bundle.
+    for (auto *I = ScheduleStart; I != ScheduleEnd;
+         I = I->getNextNode()) {
+      doForAllOpcodes(I, [](ScheduleData *SD) {
+        SD->clearDependencies();
+      });
+    }
+    resetSchedule();
     return false;
   }
   return true;
 }

-void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL,
-                                                Value *OpValue) {
+void BoUpSLP::BlockScheduling::cancelScheduling(Value *OpValue,
+                                                std::pair Key) {
   if (isa(OpValue))
     return;
-
-  ScheduleData *Bundle = getScheduleData(OpValue);
+  auto *I = dyn_cast(OpValue);
+  if (!I)
+    return;
+  ScheduleData *Bundle = getScheduleData(I, Key)->FirstInBundle;
+  assert(Bundle && "Could not find bundle");
   LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
   assert(!Bundle->IsScheduled &&
          "Can't cancel bundle which is already scheduled");
@@ -3911,44 +4235,66 @@
   // Un-bundle: make single instructions out of the bundle.
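+  // Each member becomes a single-instruction bundle again: pseudo copies are
+  // dropped from the per-(Parent, Opcode) map, while real instructions with
+  // no remaining unscheduled dependencies go back to the ready list.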
ScheduleData *BundleMember = Bundle; while (BundleMember) { - assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); + assert(BundleMember->FirstInBundle == Bundle && "Corrupt bundle links"); + assert(BundleMember->Parent == Key.first && + BundleMember->Opcode == Key.second && "Corrupt bundle"); BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { - ReadyInsts.insert(BundleMember); + if (BundleMember->isPseudo()) { + PseudoInstScheduleDataMap[BundleMember->getInst()].erase(Key); + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + } else { + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + if (BundleMember->UnscheduledDepsInBundle == 0) { + ReadyInsts.insert(BundleMember); + } } BundleMember = Next; } } -BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { - // Allocate a new ScheduleData for the instruction. +BoUpSLP::InstScheduleData * +BoUpSLP::BlockScheduling::allocateInstScheduleDataChunks() { + // Allocate a new InstScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique(ChunkSize)); + InstScheduleDataChunks.push_back( + llvm::make_unique(ChunkSize)); ChunkPos = 0; } - return &(ScheduleDataChunks.back()[ChunkPos++]); + return &(InstScheduleDataChunks.back()[ChunkPos++]); } -bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, +BoUpSLP::PseudoScheduleData * +BoUpSLP::BlockScheduling::allocatePseudoInstDataChunks() { + // Allocate a new PseudoScheduleData for the instruction. + if (PseudoChunkPos >= PseudoChunkSize) { + PseudoScheduleDataChunks.push_back( + llvm::make_unique(PseudoChunkSize)); + PseudoChunkPos = 0; + } + return &(PseudoScheduleDataChunks.back()[PseudoChunkPos++]); +} + +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Instruction *I, const InstructionsState &S) { - if (getScheduleData(V, isOneOf(S, V))) + if (getScheduleData(I, S.getKey())) return true; - Instruction *I = dyn_cast(V); - assert(I && "bundle member must be an instruction"); assert(!isa(I) && "phi nodes don't need to be scheduled"); auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { - ScheduleData *ISD = getScheduleData(I); + InstScheduleData *ISD = getInstScheduleData(I); if (!ISD) return false; assert(isInSchedulingRegion(ISD) && - "ScheduleData not in scheduling region"); - ScheduleData *SD = allocateScheduleDataChunks(); - SD->Inst = I; - SD->init(SchedulingRegionID, S.OpValue); - ExtraScheduleDataMap[I][S.OpValue] = SD; + "InstScheduleData not in scheduling region"); + if (ISD->isPartOfBundle()) { + PseudoScheduleData *PSD = allocatePseudoInstDataChunks(); + PSD->init(SchedulingRegionID, ISD, S.Parent, S.getOpcode()); + PseudoInstScheduleDataMap[I][S.getKey()] = PSD; + } return true; }; if (CheckSheduleForI(I)) @@ -3958,8 +4304,7 @@ initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a terminator?"); LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; @@ -3981,8 +4326,7 @@ if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); 
LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); return true; @@ -3994,8 +4338,7 @@ initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a terminator?"); LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); @@ -4009,21 +4352,20 @@ return true; } -void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, - Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore) { - ScheduleData *CurrentLoadStore = PrevLoadStore; +void BoUpSLP::BlockScheduling::initScheduleData( + Instruction *FromI, Instruction *ToI, InstScheduleData *PrevLoadStore, + InstScheduleData *NextLoadStore) { + InstScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - ScheduleData *SD = ScheduleDataMap[I]; + InstScheduleData *SD = InstScheduleDataMap[I]; if (!SD) { - SD = allocateScheduleDataChunks(); - ScheduleDataMap[I] = SD; + SD = allocateInstScheduleDataChunks(); + InstScheduleDataMap[I] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && - "new ScheduleData already in scheduling region"); - SD->init(SchedulingRegionID, I); + "new InstScheduleData already in scheduling region"); + SD->init(SchedulingRegionID); if (I->mayReadOrWriteMemory() && (!isa(I) || @@ -4058,8 +4400,13 @@ WorkList.pop_back(); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { assert(isInSchedulingRegion(BundleMember)); + assert(BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); + if (!BundleMember->hasValidDependencies()) { LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember @@ -4068,44 +4415,31 @@ BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. - if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. + for (User *U : BundleMember->getInst()->users()) { + if (auto *I = dyn_cast(U)) { + doForAllOpcodes(I, [&BundleMember, &WorkList](ScheduleData *UseSD) { BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); - } + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + }); + } else { + // I'm not sure if this can ever happen. But we need to be safe. 
+        // This lets the instruction/bundle never be scheduled and
+        // eventually disable vectorization.
+        BundleMember->Dependencies++;
+        BundleMember->incrementUnscheduledDeps(1);
+      }
      }
      // Handle the memory dependencies.
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (DepDest) {
-        Instruction *SrcInst = BundleMember->Inst;
+        Instruction *SrcInst = BundleMember->getInst();
         MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
-        bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+        bool SrcMayWrite = SrcInst->mayWriteToMemory();
         unsigned numAliased = 0;
         unsigned DistToSrc = 1;
@@ -4120,24 +4454,29 @@
           // It's important for the loop break condition (see below) to
           // check this limit even between two read-only instructions.
           if (DistToSrc >= MaxMemDepDistance ||
-              ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+              ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
                (numAliased >= AliasedCheckLimit ||
-                SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+                SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
             // We increment the counter only if the locations are aliased
             // (instead of counting all alias checks). This gives a better
             // balance between reduced runtime and accurate dependencies.
             numAliased++;
-            DepDest->MemoryDependencies.push_back(BundleMember);
-            BundleMember->Dependencies++;
-            ScheduleData *DestBundle = DepDest->FirstInBundle;
-            if (!DestBundle->IsScheduled) {
-              BundleMember->incrementUnscheduledDeps(1);
-            }
-            if (!DestBundle->hasValidDependencies()) {
-              WorkList.push_back(DestBundle);
-            }
+            // We don't want any duplicates in the set, to keep the
+            // dependencies correct.
+            doForAllOpcodes(DepDest->getInst(), [&BundleMember, &WorkList](
+                                                    ScheduleData *DepDest) {
+              DepDest->MemoryDependencies.push_back(BundleMember);
+              BundleMember->Dependencies++;
+              ScheduleData *DestBundle = DepDest->FirstInBundle;
+              if (!DestBundle->IsScheduled) {
+                BundleMember->incrementUnscheduledDeps(1);
+              }
+              if (!DestBundle->hasValidDependencies()) {
+                WorkList.push_back(DestBundle);
+              }
+            });
           }
           DepDest = DepDest->NextLoadStore;
@@ -4164,7 +4503,7 @@
     }
     if (InsertInReadyList && SD->isReady()) {
       ReadyInsts.push_back(SD);
-      LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+      LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->getInst()
                         << "\n");
     }
   }
@@ -4176,7 +4515,7 @@
   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
     doForAllOpcodes(I, [&](ScheduleData *SD) {
       assert(isInSchedulingRegion(SD) &&
-             "ScheduleData not in scheduling region");
+             "InstScheduleData not in scheduling region");
       SD->IsScheduled = false;
       SD->resetUnscheduledDeps();
     });
@@ -4184,6 +4523,56 @@
   ReadyInsts.clear();
 }

+void BoUpSLP::BlockScheduling::reorderBundles() {
+  SmallPtrSet Bundles;
+  DenseMap ReorderMap;
+  for (auto I = PseudoInstScheduleDataMap.begin(),
+            E = PseudoInstScheduleDataMap.end();
+       I != E; ++I) {
+    doForAllOpcodes(I->first, [&Bundles](ScheduleData *SD) {
+      if (SD->isPartOfBundle())
+        Bundles.insert(SD->FirstInBundle);
+    });
+  }
+  // Walk backward in the BB to discover the last instruction
+  // for a bundle.
+  for (auto I = BB->rbegin(), E = BB->rend(); (I != E && Bundles.size() > 0);
+       ++I) {
+    doForAllOpcodes(&*I, [&ReorderMap, &Bundles](ScheduleData *SD) {
+      if (SD->isPartOfBundle() && Bundles.count(SD->FirstInBundle) != 0) {
+        ReorderMap[SD->FirstInBundle] = SD;
+        Bundles.erase(SD->FirstInBundle);
+      }
+    });
+  }
+  // Swap the last scheduled instruction with the first one in the bundle.
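+  // The member that was placed last in the block becomes the new bundle head:
+  // every member's FirstInBundle is redirected to it and the NextInBundle
+  // links are rethreaded around the two swapped positions.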
+ for (auto I = ReorderMap.begin(), E = ReorderMap.end(); I != E; ++I) { + ScheduleData *FirstSD = I->first; + ScheduleData *LastSD = I->second; + SmallVector Bundle; + unsigned LastPos = 0; + // The first instruction in the bundle is already the last one scheduled. + if (FirstSD == LastSD) + continue; + ScheduleData *SD = FirstSD; + while (SD) { + if (SD == LastSD) + LastPos = Bundle.size(); + Bundle.push_back(SD); + SD = SD->NextInBundle; + } + std::swap(Bundle[0], Bundle[LastPos]); + for (ScheduleData *SD : Bundle) + SD->FirstInBundle = Bundle[0]; + Bundle[0]->NextInBundle = Bundle[1]; + Bundle[LastPos - 1]->NextInBundle = Bundle[LastPos]; + if (LastPos == Bundle.size() - 1) + Bundle[LastPos]->NextInBundle = nullptr; + else + Bundle[LastPos]->NextInBundle = Bundle[LastPos + 1]; + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -4210,7 +4599,9 @@ I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (getTreeEntry(SD->getInst(), + std::make_pair(SD->Parent, SD->Opcode)) != + nullptr) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -4231,21 +4622,36 @@ // Move the scheduled instruction(s) to their dedicated places, if not // there yet. ScheduleData *BundleMember = picked; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - Instruction *pickedInst = BundleMember->Inst; - if (LastScheduledInst->getNextNode() != pickedInst) { - BS->BB->getInstList().remove(pickedInst); + assert(Opcode == BundleMember->Opcode && + Parent == BundleMember->Parent && "Corrupt bundle member"); + Instruction *PickedInst = BundleMember->getInst(); + if (LastScheduledInst->getNextNode() != PickedInst) { + BS->BB->getInstList().remove(PickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), - pickedInst); + PickedInst); } - LastScheduledInst = pickedInst; + LastScheduledInst = PickedInst; BundleMember = BundleMember->NextInBundle; } - BS->schedule(picked, ReadyInsts); NumToSchedule--; } +#ifndef NDEBUG + if (NumToSchedule != 0) { + for (BasicBlock::iterator I = BS->BB->begin(), E = BS->BB->end(); I != E; + ++I) { + BS->doForAllOpcodes(&*I, [](ScheduleData *SD) { + if (SD->isSchedulingEntity() && SD->UnscheduledDepsInBundle != 0) + LLVM_DEBUG(dbgs() << "SLP: Failed to schedule: " << *SD << ".\n"); + }); + } + } +#endif assert(NumToSchedule == 0 && "could not schedule all instructions"); + BS->reorderBundles(); // Avoid duplicate scheduling of the block. BS->ScheduleStart = nullptr; @@ -4862,9 +5268,14 @@ // Check that all of the parts are scalar instructions of the same type, // we permit an alternate opcode via InstructionsState. 
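+  // The state is computed once for the whole list; any element that matches
+  // neither the main nor the alternate opcode makes the list non-vectorizable
+  // below.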
- InstructionsState S = getSameOpcode(VL); + InstructionsState S = getSameOpcode(VL[0], VL); if (!S.getOpcode()) return false; + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (isOneOf(S, I) != I) + return false; + } Instruction *I0 = cast(S.OpValue); unsigned Sz = R.getVectorElementSize(I0); Index: test/Transforms/SLPVectorizer/X86/cancel_scheduling.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/cancel_scheduling.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; This testcase shows the failure of scheduling bundles after calling +; cancelScheduling() in tryScheduleBundle() and not cleaning all +; dependencies. The dependency values are supposed to be cleared, +; since everything was calculated before we cancel the bundle. + +define dso_local void @fn1() local_unnamed_addr #0 { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 2 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 3 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 17 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 16 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 15 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 12 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 11 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX9]], align 2 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 10 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX10]], align 2 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 9 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX11]], align 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 8 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX12]], align 2 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 6 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 5 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[ARRAYIDX2]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> , <8 x i16>* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* inttoptr (i64 1 to i8*), align 1 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 18 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i8 [[TMP1]], 2 +; 
CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 19 +; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 20 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* inttoptr (i64 2 to i8*), align 2 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i8 [[TMP4]], 4 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 21 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i8 [[TMP4]], 1 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 22 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 23 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* inttoptr (i64 3 to i8*), align 1 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i8 [[TMP7]], 3 +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 24 +; CHECK-NEXT: [[ARRAYIDX54:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 25 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* inttoptr (i64 4 to i8*), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = lshr i8 [[TMP9]], 4 +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 26 +; CHECK-NEXT: [[TMP11:%.*]] = lshr i8 [[TMP9]], 1 +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 27 +; CHECK-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 28 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* inttoptr (i64 5 to i8*), align 1 +; CHECK-NEXT: [[TMP13:%.*]] = lshr i8 [[TMP12]], 3 +; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 29 +; CHECK-NEXT: [[ARRAYIDX79:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 30 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* inttoptr (i64 6 to i8*), align 2 +; CHECK-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 31 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i8 [[TMP14]], 2 +; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 32 +; CHECK-NEXT: [[TMP16:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = or i8 [[TMP3]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i8> [[TMP18]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP19]], i8 [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP6]], i32 4 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[TMP4]], i32 5 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP8]], i32 6 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP25]], i8 [[TMP10]], i32 8 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP11]], i32 9 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[TMP9]], i32 10 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP13]], i32 11 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP12]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP14]], i32 13 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement 
<16 x i8> [[TMP31]], i8 [[TMP15]], i32 14 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP16]], i32 15 +; CHECK-NEXT: [[TMP34:%.*]] = ashr <16 x i8> [[TMP33]], +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i8> [[TMP33]], +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[TMP36]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i16> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP37]], <16 x i16> [[TMP38]], <16 x i32> +; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 33 +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i16* [[ARRAYIDX17]] to <16 x i16>* +; CHECK-NEXT: store <16 x i16> [[TMP39]], <16 x i16>* [[TMP40]], align 2 +; CHECK-NEXT: ret void +; +entry: + %arrayidx = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 2 + store i16 2, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 1 + store i16 2, i16* %arrayidx1, align 2 + %arrayidx2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 0 + store i16 2, i16* %arrayidx2, align 2 + %arrayidx3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 4 + store i16 0, i16* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 3 + store i16 0, i16* %arrayidx4, align 2 + %arrayidx5 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 17 + store i16 7, i16* %arrayidx5, align 2 + %arrayidx6 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 16 + store i16 7, i16* %arrayidx6, align 2 + %arrayidx7 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 15 + store i16 7, i16* %arrayidx7, align 2 + %arrayidx8 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 12 + store i16 7, i16* %arrayidx8, align 2 + %arrayidx9 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 11 + store i16 7, i16* %arrayidx9, align 2 + %arrayidx10 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 10 + store i16 7, i16* %arrayidx10, align 2 + %arrayidx11 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 9 + store i16 7, i16* %arrayidx11, align 2 + %arrayidx12 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 8 + store i16 7, i16* %arrayidx12, align 2 + %arrayidx13 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 7 + store i16 7, i16* %arrayidx13, align 2 + %arrayidx14 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 6 + store i16 7, i16* %arrayidx14, align 2 + %arrayidx15 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 5 + store i16 7, i16* %arrayidx15, align 2 + %0 = load i8, i8* inttoptr (i64 1 to i8*), align 1 + %1 = ashr i8 %0, 7 + %conv16 = sext i8 %1 to i16 + %arrayidx17 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 18 + store i16 %conv16, i16* %arrayidx17, align 2 + %2 = lshr i8 %0, 2 + %3 = and i8 %2, 7 + %conv20 = zext i8 %3 to i16 + %arrayidx21 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 19 + store i16 %conv20, i16* %arrayidx21, align 2 + %4 = and i8 %0, 2 + %arrayidx26 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 20 + %5 = or i8 %4, 1 + %conv29 = zext i8 %5 to i16 + store i16 %conv29, i16* %arrayidx26, align 2 + %6 = load i8, i8* inttoptr (i64 2 to i8*), align 2 + %7 = lshr i8 %6, 4 + %8 = and i8 %7, 7 + %conv33 = zext i8 %8 to 
i16 + %arrayidx34 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 21 + store i16 %conv33, i16* %arrayidx34, align 2 + %9 = lshr i8 %6, 1 + %10 = and i8 %9, 7 + %conv38 = zext i8 %10 to i16 + %arrayidx39 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 22 + store i16 %conv38, i16* %arrayidx39, align 2 + %11 = and i8 %6, 2 + %conv43 = zext i8 %11 to i16 + %arrayidx44 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 23 + store i16 %conv43, i16* %arrayidx44, align 2 + %12 = load i8, i8* inttoptr (i64 3 to i8*), align 1 + %13 = lshr i8 %12, 3 + %14 = and i8 %13, 7 + %conv48 = zext i8 %14 to i16 + %arrayidx49 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 24 + store i16 %conv48, i16* %arrayidx49, align 2 + %15 = and i8 %12, 7 + %conv53 = zext i8 %15 to i16 + %arrayidx54 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 25 + store i16 %conv53, i16* %arrayidx54, align 2 + %16 = load i8, i8* inttoptr (i64 4 to i8*), align 4 + %17 = lshr i8 %16, 4 + %18 = and i8 %17, 7 + %conv58 = zext i8 %18 to i16 + %arrayidx59 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 26 + store i16 %conv58, i16* %arrayidx59, align 2 + %19 = lshr i8 %16, 1 + %20 = and i8 %19, 7 + %conv63 = zext i8 %20 to i16 + %arrayidx64 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 27 + store i16 %conv63, i16* %arrayidx64, align 2 + %21 = and i8 %16, 2 + %conv68 = zext i8 %21 to i16 + %arrayidx69 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 28 + store i16 %conv68, i16* %arrayidx69, align 2 + %22 = load i8, i8* inttoptr (i64 5 to i8*), align 1 + %23 = lshr i8 %22, 3 + %24 = and i8 %23, 7 + %conv73 = zext i8 %24 to i16 + %arrayidx74 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 29 + store i16 %conv73, i16* %arrayidx74, align 2 + %25 = and i8 %22, 7 + %conv78 = zext i8 %25 to i16 + %arrayidx79 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 30 + store i16 %conv78, i16* %arrayidx79, align 2 + %26 = load i8, i8* inttoptr (i64 6 to i8*), align 2 + %27 = and i8 %26, 7 + %conv82 = zext i8 %27 to i16 + %arrayidx83 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 31 + store i16 %conv82, i16* %arrayidx83, align 2 + %28 = lshr i8 %26, 2 + %29 = and i8 %28, 7 + %conv87 = zext i8 %29 to i16 + %arrayidx88 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 32 + store i16 %conv87, i16* %arrayidx88, align 2 + %30 = shl i8 %26, 1 + %31 = and i8 %30, 6 + %conv91 = zext i8 %31 to i16 + %arrayidx92 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 33 + store i16 %conv91, i16* %arrayidx92, align 2 + ret void +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/SLPVectorizer/X86/insert-after-multiple-bundle.ll =================================================================== --- /dev/null +++ 
test/Transforms/SLPVectorizer/X86/insert-after-multiple-bundle.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -slp-vectorizer -slp-vectorizer -mcpu=bdver1 < %s | FileCheck %s + +; Function Attrs: nounwind uwtable +define dso_local void @fn1() local_unnamed_addr #0 { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* undef, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> , i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shl nsw <4 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <4 x i32> undef, [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP0]], [[TMP5]] +; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP5]], <4 x i32> [[TMP0]] +; CHECK-NEXT: br label [[FOR_BODY]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %j.064 = phi i32 [ undef, %entry ], [ %spec.select, %for.body ] + %k.063 = phi i32 [ undef, %entry ], [ %k.1, %for.body ] + %l.062 = phi i32 [ undef, %entry ], [ %spec.select58, %for.body ] + %m.061 = phi i32 [ undef, %entry ], [ %m.1, %for.body ] + %conv = zext i8 undef to i32 + %mul = shl nuw nsw i32 %conv, 1 + %sub = sub nsw i32 undef, %mul + %mul4 = shl nuw nsw i32 %conv, 2 + %sub5 = sub nsw i32 undef, %mul4 + %conv8 = zext i8 undef to i32 + %0 = load i32, i32* undef, align 4 + %add = add nsw i32 %0, %conv8 + %mul11 = shl nsw i32 %add, 1 + %sub12 = sub nsw i32 undef, %mul11 + %mul19 = shl nsw i32 %add, 2 + %sub20 = sub nsw i32 undef, %mul19 + %cmp = icmp slt i32 %j.064, %sub + %spec.select = select i1 %cmp, i32 %sub, i32 %j.064 + %cmp22 = icmp slt i32 %k.063, %sub5 + %k.1 = select i1 %cmp22, i32 %sub5, i32 %k.063 + %cmp26 = icmp slt i32 %l.062, %sub12 + %spec.select58 = select i1 %cmp26, i32 %sub12, i32 %l.062 + %cmp30 = icmp slt i32 %m.061, %sub20 + %m.1 = select i1 %cmp30, i32 %sub20, i32 %m.061 + br label %for.body +} + +; Function Attrs: nounwind uwtable +define dso_local void @axis_to_quat() local_unnamed_addr #0 { +; CHECK-LABEL: @axis_to_quat( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = fptrunc double undef to float +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float undef, [[CONV]] +; CHECK-NEXT: store float [[MUL]], float* undef, align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* undef, i64 1 +; CHECK-NEXT: [[MUL2:%.*]] = fmul fast float undef, [[CONV]] +; CHECK-NEXT: store float [[MUL2]], float* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* undef, i64 2 +; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float undef, [[CONV]] +; CHECK-NEXT: store float [[MUL4]], float* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = tail call fast double @llvm.cos.f64(double 0x7FF8000000000000) +; CHECK-NEXT: [[CONV6:%.*]] = fptrunc double [[TMP0]] to float +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* undef, i64 3 +; CHECK-NEXT: store float [[CONV6]], float* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: ret void +; +entry: + %conv = fptrunc double undef to float + %mul = fmul fast float undef, %conv + store float %mul, float* undef, align 4 + %arrayidx1 = getelementptr inbounds float, float* 
undef, i64 1 + %mul2 = fmul fast float undef, %conv + store float %mul2, float* %arrayidx1, align 4 + %arrayidx3 = getelementptr inbounds float, float* undef, i64 2 + %mul4 = fmul fast float undef, %conv + store float %mul4, float* %arrayidx3, align 4 + %0 = tail call fast double @llvm.cos.f64(double 0x7FF8000000000000) + %conv6 = fptrunc double %0 to float + %arrayidx7 = getelementptr inbounds float, float* undef, i64 3 + store float %conv6, float* %arrayidx7, align 4 + ret void +} + +; Function Attrs: nounwind readnone speculatable +declare double @llvm.cos.f64(double) #1 Index: test/Transforms/SLPVectorizer/X86/memory-dep.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/memory-dep.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.anon.1.2.3.4.87 = type { [6 x [6 x i16]], [6 x [6 x i32]], [0 x [4 x [4 x i32]]] } + +@f = external dso_local local_unnamed_addr global %struct.anon.1.2.3.4.87, align 4 + +; Function Attrs: norecurse nounwind uwtable +define dso_local void @itrans() local_unnamed_addr #0 { +; CHECK-LABEL: @itrans( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 undef, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 2), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 undef, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 undef, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> , [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[TMP6]], undef +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> undef, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i1> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> undef, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i1> [[TMP12]] to <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 0) to <4 x i32>*), align 4 +; CHECK-NEXT: ret void +; +entry: + %add8 = add nsw i32 undef, undef + store i32 undef, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 + %add15 = add nsw i32 undef, undef + %add26 = add nsw i32 %add8, 2 + %sub27 = sub i32 %add26, undef + %add33 = add nsw i32 %sub27, undef + %shl = shl i32 %add33, 6 + %cmp.i = icmp slt i32 %shl, 1 + %conv.i = zext i1 %cmp.i to i32 + %cmp1.i = icmp slt i32 undef, %conv.i + %conv2.i = zext i1 %cmp1.i to i32 + store i32 %conv2.i, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 0), align 4 + %add26.1 = add nsw i32 %add15, 2 + %sub27.1 = 
sub i32 %add26.1, undef + %add33.1 = add nsw i32 %sub27.1, undef + %shl.1 = shl i32 %add33.1, 6 + %cmp.i.1 = icmp slt i32 %shl.1, 1 + %conv.i.1 = zext i1 %cmp.i.1 to i32 + %cmp1.i.1 = icmp slt i32 undef, %conv.i.1 + %conv2.i.1 = zext i1 %cmp1.i.1 to i32 + store i32 %conv2.i.1, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 1), align 4 + %0 = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 2), align 4 + %add26.2 = add nsw i32 %0, 2 + %sub27.2 = sub i32 %add26.2, undef + %add33.2 = add nsw i32 %sub27.2, undef + %shl.2 = shl i32 %add33.2, 6 + %cmp.i.2 = icmp slt i32 %shl.2, 1 + %conv.i.2 = zext i1 %cmp.i.2 to i32 + %cmp1.i.2 = icmp slt i32 undef, %conv.i.2 + %conv2.i.2 = zext i1 %cmp1.i.2 to i32 + store i32 %conv2.i.2, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 2), align 4 + %1 = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 + %add26.3 = add nsw i32 %1, 2 + %sub27.3 = sub i32 %add26.3, undef + %add33.3 = add nsw i32 %sub27.3, undef + %shl.3 = shl i32 %add33.3, 6 + %cmp.i.3 = icmp slt i32 %shl.3, 1 + %conv.i.3 = zext i1 %cmp.i.3 to i32 + %cmp1.i.3 = icmp slt i32 undef, %conv.i.3 + %conv2.i.3 = zext i1 %cmp1.i.3 to i32 + store i32 %conv2.i.3, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 3), align 4 + ret void +} Index: test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/pr35497.ll +++ test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -12,20 +12,20 @@ define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 { ; CHECK-LABEL: @_ZN1C10SwitchModeEv( ; CHECK-NEXT: for.body.lr.ph.i: -; CHECK-NEXT: [[OR_1:%.*]] = or i64 undef, 1 -; CHECK-NEXT: store i64 [[OR_1]], i64* undef, align 8 +; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[BAR5]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: store i64 [[TMP2]], i64* undef, align 8 ; CHECK-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP1]], [[TMP4]] ; CHECK-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x 
i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; for.body.lr.ph.i: Index: test/Transforms/SLPVectorizer/X86/rem-bundle.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/rem-bundle.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -slp-vectorizer -slp-vectorizer -mcpu=bdver1 < %s | FileCheck %s + +%struct.anon.0.1.2.3.20 = type { i32, i32, i32, i32 } + +; This testcase shows the failure of combining any remainder operation +; in a bundle with non-alternative operations. + +@b = external dso_local local_unnamed_addr global i32, align 4 +@c = external dso_local local_unnamed_addr global %struct.anon.0.1.2.3.20, align 4 + +; Function Attrs: norecurse nounwind uwtable +define dso_local void @fn1() local_unnamed_addr #0 { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @b, align 4 +; CHECK-NEXT: store i32 [[TMP0]], i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 3), align 4 +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 undef, 600 +; CHECK-NEXT: store i32 [[DIV]], i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 2), align 4 +; CHECK-NEXT: [[DIV1:%.*]] = sdiv i32 undef, 60 +; CHECK-NEXT: store i32 [[DIV1]], i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 1), align 4 +; CHECK-NEXT: [[REM:%.*]] = srem i32 undef, 60 +; CHECK-NEXT: store i32 [[REM]], i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 0), align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = load i32, i32* @b, align 4 + store i32 %0, i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 3), align 4 + %div = sdiv i32 undef, 600 + store i32 %div, i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 2), align 4 + %div1 = sdiv i32 undef, 60 + store i32 %div1, i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 1), align 4 + %rem = srem i32 undef, 60 + store i32 %rem, i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 0), align 4 + ret void +} Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -43,22 +43,16 @@ ; CHECK-LABEL: @add1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr 
inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -86,22 +80,16 @@ ; CHECK-LABEL: @sub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -205,22 +193,18 @@ ; CHECK-LABEL: @addsub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], 
align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -248,22 +232,18 @@ ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -291,22 +271,16 @@ ; CHECK-LABEL: @mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* 
[[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 -; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -334,22 +308,16 @@ ; CHECK-LABEL: @shl0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -453,22 +421,16 @@ ; CHECK-LABEL: @add1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; 
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -496,22 +458,16 @@
; CHECK-LABEL: @sub0f(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -615,22 +571,18 @@
; CHECK-LABEL: @addsub0f(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -658,22 +610,18 @@
; CHECK-LABEL: @addsub1f(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -701,22 +649,16 @@
; CHECK-LABEL: @mulf(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -825,22 +767,16 @@
; CHECK-LABEL: @sub0fn(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: ret void
;
entry:
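; NOTE (editorial sketch, not part of the patch): the updated tests above all share the
; same shape. A plausible C-level source for a case like @shl0 is sketched below, where
; one lane is a plain copy that the copyable-element support appears to model as the
; surrounding binary op with its identity operand (here, shl by 0). The function name
; and restrict qualifiers mirror the IR above; the C form itself is an assumption.
;
;   void shl0(int *restrict dst, int *restrict src) {
;     dst[0] = src[0];       /* plain copy: assumed to be modeled as src[0] << 0 */
;     dst[1] = src[1] << 1;
;     dst[2] = src[2] << 2;
;     dst[3] = src[3] << 3;
;   }
;
; With that modeling, all four stores collapse into a single <4 x i32> shl whose constant
; operand is <i32 0, i32 1, i32 2, i32 3>, matching the updated checks.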