Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -299,6 +299,22 @@
            : TargetTransformInfo::SK_PermuteSingleSrc;
 }
 
+/// Checks if the \p Opcode can be considered as an operand of a (possibly)
+/// binary operation \p I.
+/// \returns The code of the binary operation of instruction \p I if the
+/// instruction with \p Opcode can be considered as an operand of \p I with the
+/// default value.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+  if (I->getOpcode() != Instruction::PHI &&
+      I->getOpcode() != Instruction::SRem &&
+      I->getOpcode() != Instruction::URem &&
+      I->getOpcode() != Instruction::FRem &&
+      (I->getType()->isIntegerTy() ||
+       (isa<FPMathOperator>(I) && cast<FPMathOperator>(I)->isFast())))
+    return I->getOpcode();
+  return 0;
+}
+
 namespace {
 
 /// Main data required for vectorization of instructions.
@@ -320,14 +336,15 @@
   }
 
   /// Some of the instructions in the list have alternate opcodes.
-  bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+  bool isAltShuffle() const { return (getOpcode() != 0 && getAltOpcode() != 0 &&
+                                      getOpcode() != getAltOpcode()); }
 
   bool isOpcodeOrAlt(Instruction *I) const {
     unsigned CheckedOpcode = I->getOpcode();
     return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
   }
 
-  InstructionsState() = delete;
+  InstructionsState() = default;
   InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
       : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
 };
@@ -353,41 +370,92 @@
   if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
 
+  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
   bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
   bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
-  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
+  bool IsNonAlt = false;
   unsigned AltOpcode = Opcode;
+  unsigned OpcodeNum = 0;
+  unsigned AltOpcodeNum = 0;
+  unsigned NonAltNum = 0;
+  unsigned NonAltIndex = 0;
   unsigned AltIndex = BaseIndex;
 
-  // Check for one alternate opcode from another BinaryOperator.
-  // TODO - generalize to support all operators (types, calls etc.).
+  // Check for an alternate opcode pattern.
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); - if (IsBinOp && isa(VL[Cnt])) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) - continue; - if (Opcode == AltOpcode) { - AltOpcode = InstOpcode; - AltIndex = Cnt; - continue; - } - } else if (IsCastOp && isa(VL[Cnt])) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (IsCastOp && isa(VL[Cnt])) { Type *Ty0 = cast(VL[BaseIndex])->getOperand(0)->getType(); Type *Ty1 = cast(VL[Cnt])->getOperand(0)->getType(); if (Ty0 == Ty1) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) + if (InstOpcode == Opcode) { + OpcodeNum++; + continue; + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; continue; + } if (Opcode == AltOpcode) { AltOpcode = InstOpcode; AltIndex = Cnt; + AltOpcodeNum++; continue; } } - } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } + if (InstOpcode == Opcode) { + OpcodeNum++; continue; - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; + continue; + } + if (InstOpcode != Opcode && InstOpcode != AltOpcode) { + if (IsBinOp && AltOpcode == Opcode && isa(I)) { + AltOpcode = InstOpcode; + AltOpcodeNum++; + AltIndex = Cnt; + continue; + } + if (Opcode != Instruction::PHI && + (tryToRepresentAsInstArg(Opcode, I) || + (IsBinOp && InstOpcode != Instruction::PHI && + tryToRepresentAsInstArg(InstOpcode, + cast(VL[BaseIndex]))))) { + if (!IsNonAlt) { + NonAltIndex = Cnt; + IsNonAlt = true; + } + NonAltNum++; + continue; + } + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } } + if (IsNonAlt && VL.size() > 2 && (OpcodeNum + AltOpcodeNum) <= NonAltNum) { + BaseIndex = NonAltIndex; + AltIndex = BaseIndex; + Opcode = cast(VL[BaseIndex])->getOpcode(); + AltOpcode = Opcode; + IsBinOp = isa(VL[BaseIndex]); + for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (Opcode == AltOpcode && IsBinOp && isa(I)) { + AltOpcode = InstOpcode; + AltIndex = Cnt; + } + } + } + + if (IsNonAlt && !IsBinOp) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), cast(VL[AltIndex])); } @@ -701,10 +769,14 @@ /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + + /// Info about instruction in this tree entry. + InstructionsState State; }; /// Create a new VectorizableTree entry. 
void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + const InstructionsState &S, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); @@ -716,11 +788,20 @@ ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; if (Vectorized) { + Last->State = S; for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + assert(!getTreeEntry(VL[i], S.getOpcode()) && "Scalar already in tree!"); + ScalarToTreeEntry[VL[i]][S.getOpcode()] = idx; } } else { + for (Value *V: VL) { + if (Instruction *I = dyn_cast(V)) { + Last->State.MainOp = I; + Last->State.AltOp = I; + break; + } + } + Last->State.OpValue = VL[0]; MustGather.insert(VL.begin(), VL.end()); } @@ -735,13 +816,29 @@ TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + if (I != ScalarToTreeEntry.end()) { + auto &STT = I->second; + for (auto STTI : STT) { + if (isOneOf(VectorizableTree[STTI.second].State, V) == V) + return &VectorizableTree[STTI.second]; + } + } + return nullptr; + } + + TreeEntry *getTreeEntry(Value *V, unsigned Opcode) { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) { + auto &STT = I->second; + auto STTI = STT.find(Opcode); + if (STTI != STT.end()) + return &VectorizableTree[STTI->second]; + } return nullptr; } /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; + SmallDenseMap> ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -831,19 +928,6 @@ // dependencies are not calculated yet. enum { InvalidDeps = -1 }; - ScheduleData() = default; - - void init(int BlockSchedulingRegionID, Value *OpVal) { - FirstInBundle = this; - NextInBundle = nullptr; - NextLoadStore = nullptr; - IsScheduled = false; - SchedulingRegionID = BlockSchedulingRegionID; - UnscheduledDepsInBundle = UnscheduledDeps; - clearDependencies(); - OpValue = OpVal; - } - /// Returns true if the dependency information has been calculated. bool hasValidDependencies() const { return Dependencies != InvalidDeps; } @@ -885,24 +969,39 @@ MemoryDependencies.clear(); } + /// Get an instruction behind this ScheduleData instance. + virtual Instruction *getInst() const = 0; + + /// Returns true if the instance is a pseudo instruction one. + virtual bool isPseudo() const = 0; + void dump(raw_ostream &os) const { if (!isSchedulingEntity()) { - os << "/ " << *Inst; + os << "/ "; + if (isPseudo()) + os << "*"; + os << *getInst(); } else if (NextInBundle) { - os << '[' << *Inst; + os << '['; + if (isPseudo()) + os << "*"; + os << *getInst(); ScheduleData *SD = NextInBundle; while (SD) { - os << ';' << *SD->Inst; - SD = SD->NextInBundle; + os << ';' ; + if (SD->isPseudo()) + os << "*"; + os << *SD->getInst(); + SD = SD->NextInBundle; } os << ']'; } else { - os << *Inst; + if (isPseudo()) + os << "*"; + os << *getInst(); } } - Instruction *Inst = nullptr; - /// Points to the head in an instruction bundle (and always to this for /// single instructions). ScheduleData *FirstInBundle = nullptr; @@ -916,8 +1015,8 @@ ScheduleData *NextLoadStore = nullptr; /// The dependent memory instructions. - /// This list is derived on demand in calculateDependencies(). - SmallVector MemoryDependencies; + /// This set is derived on demand in calculateDependencies(). 
+ SmallPtrSet MemoryDependencies; /// This ScheduleData is in the current scheduling region if this matches /// the current SchedulingRegionID of BlockScheduling. @@ -946,7 +1045,64 @@ /// dry-run). bool IsScheduled = false; - /// Opcode of the current instruction in the schedule data. + /// Opcode that represents instructions to be vectorized. + unsigned Opcode = 0; + }; + + struct InstScheduleData : public ScheduleData { + + InstScheduleData() = default; + + Instruction *Inst = nullptr; + + void init(int BlockSchedulingRegionID) { + FirstInBundle = this; + NextInBundle = nullptr; + NextLoadStore = nullptr; + IsScheduled = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + } + + Instruction *getInst() const { + return Inst; + } + + bool isPseudo() const { + return false; + } + + }; + + struct PseudoScheduleData : public ScheduleData { + + PseudoScheduleData() = default; + + InstScheduleData *ISD; + + void init(int BlockSchedulingRegionID, InstScheduleData *OpISD, + Value *OpVal, unsigned OpCode) { + FirstInBundle = this; + NextInBundle = nullptr; + NextLoadStore = OpISD->NextLoadStore; + IsScheduled = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + OpValue = OpVal; + ISD = OpISD; + Opcode = OpCode; + } + + Instruction *getInst() const { + return ISD->Inst; + } + + bool isPseudo() const { + return true; + } + Value *OpValue = nullptr; }; @@ -964,7 +1120,8 @@ /// Contains all scheduling data for a basic block. struct BlockScheduling { BlockScheduling(BasicBlock *BB) - : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} + : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), + PseudoChunkSize(BB->size()), PseudoChunkPos(PseudoChunkSize) {} void clear() { ReadyInsts.clear(); @@ -972,6 +1129,7 @@ ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + PseudoInstScheduleDataMap.clear(); // Reduce the maximum schedule region size by the size of the // previous scheduling run. @@ -985,21 +1143,23 @@ ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; + InstScheduleData *getInstScheduleData(Value *V) { + InstScheduleData *SD = InstScheduleDataMap[V]; if (SD && SD->SchedulingRegionID == SchedulingRegionID) return SD; return nullptr; } - ScheduleData *getScheduleData(Value *V, Value *Key) { - if (V == Key) - return getScheduleData(V); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) - return SD; + ScheduleData *getScheduleData(Value *V, unsigned Opcode) { + ScheduleData *SD = getInstScheduleData(V); + if (SD && SD->Opcode == Opcode) + return SD; + auto I = PseudoInstScheduleDataMap.find(V); + if (I != PseudoInstScheduleDataMap.end()) { + PseudoScheduleData *PSD = I->second[Opcode]; + if (PSD && PSD->SchedulingRegionID == SchedulingRegionID && + PSD->Opcode == Opcode) + return PSD; } return nullptr; } @@ -1016,13 +1176,11 @@ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; - continue; - } + assert(BundleMember->Opcode == Opcode && "Corrupt bundle member"); // Handle the def-use chain dependencies. 
- for (Use &U : BundleMember->Inst->operands()) { + for (Use &U : BundleMember->getInst()->operands()) { auto *I = dyn_cast(U.get()); if (!I) continue; @@ -1060,13 +1218,23 @@ void doForAllOpcodes(Value *V, function_ref Action) { - if (ScheduleData *SD = getScheduleData(V)) - Action(SD); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) - for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) - Action(P.second); + bool Found = false; + auto I = PseudoInstScheduleDataMap.find(V); + if (I != PseudoInstScheduleDataMap.end()) { + for (auto &P : I->second) { + ScheduleData *SD = P.second; + if (SD && SD->isPartOfBundle() && + SD->SchedulingRegionID == SchedulingRegionID) { + Found = true; + Action(SD); + } + } + } + if (ScheduleData *SD = getInstScheduleData(V)) { + if (!Found || SD->isPartOfBundle()) { + Action(SD); + } + } } /// Put all instructions into the ReadyList which are ready for scheduling. @@ -1090,20 +1258,22 @@ const InstructionsState &S); /// Un-bundles a group of instructions. - void cancelScheduling(ArrayRef VL, Value *OpValue); + void cancelScheduling(Value *OpValue, unsigned Opcode); /// Allocates schedule data chunk. - ScheduleData *allocateScheduleDataChunks(); + InstScheduleData *allocateInstScheduleDataChunks(); + + PseudoScheduleData *allocatePseudoInstDataChunks(); /// Extends the scheduling region so that V is inside the region. /// \returns true if the region size is within the limit. bool extendSchedulingRegion(Value *V, const InstructionsState &S); - /// Initialize the ScheduleData structures for new instructions in the + /// Initialize the InstScheduleData structures for new instructions in the /// scheduling region. void initScheduleData(Instruction *FromI, Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore); + InstScheduleData *PrevLoadStore, + InstScheduleData *NextLoadStore); /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. @@ -1115,24 +1285,30 @@ BasicBlock *BB; - /// Simple memory allocation for ScheduleData. - std::vector> ScheduleDataChunks; + /// Simple memory allocation for InstScheduleData. + std::vector> InstScheduleDataChunks; + + std::vector> PseudoScheduleDataChunks; - /// The size of a ScheduleData array in ScheduleDataChunks. + /// The size of a InstScheduleData array in InstScheduleDataChunks. int ChunkSize; /// The allocator position in the current chunk, which is the last entry - /// of ScheduleDataChunks. + /// of InstScheduleDataChunks. int ChunkPos; - /// Attaches ScheduleData to Instruction. + int PseudoChunkSize; + + int PseudoChunkPos; + + /// Attaches InstScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. - /// ScheduleData structures are recycled. - DenseMap ScheduleDataMap; + /// InstScheduleData structures are recycled. + DenseMap InstScheduleDataMap; - /// Attaches ScheduleData to Instruction with the leading key. - DenseMap> - ExtraScheduleDataMap; + /// Attaches InstScheduleData to Instruction with the leading key. + DenseMap> + PseudoInstScheduleDataMap; struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } @@ -1149,11 +1325,11 @@ /// The first memory accessing instruction in the scheduling region /// (can be null). 
- ScheduleData *FirstLoadStoreInRegion = nullptr; + InstScheduleData *FirstLoadStoreInRegion = nullptr; /// The last memory accessing instruction in the scheduling region /// (can be null). - ScheduleData *LastLoadStoreInRegion = nullptr; + InstScheduleData *LastLoadStoreInRegion = nullptr; /// The current size of the scheduling region. int ScheduleRegionSize = 0; @@ -1162,9 +1338,9 @@ int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; /// The ID of the scheduling region. For a new vectorization iteration this - /// is incremented which "removes" all ScheduleData from the region. + /// is incremented which "removes" all InstScheduleData from the region. // Make sure that the initial SchedulingRegionID is greater than the - // initial SchedulingRegionID in ScheduleData (which is 0). + // initial SchedulingRegionID in InstScheduleData (which is 0). int SchedulingRegionID = 1; }; @@ -1345,6 +1521,8 @@ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; int FoundLane = Lane; + if (!Entry->State.isOpcodeOrAlt(cast(Scalar))) + continue; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), @@ -1392,6 +1570,34 @@ } } +static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) { + switch(Opcode) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return ConstantInt::getNullValue(Ty); + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + return ConstantInt::get(Ty, /*V=*/1); + case Instruction::FAdd: + case Instruction::FSub: + return ConstantFP::get(Ty, /*V=*/0.0); + case Instruction::FMul: + case Instruction::FDiv: + return ConstantFP::get(Ty, /*V=*/1.0); + case Instruction::And: + return ConstantInt::getAllOnesValue(Ty); + default: + break; + } + llvm_unreachable("unknown binop for default constant value"); +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -1399,28 +1605,28 @@ InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. 
\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1432,7 +1638,7 @@ if (EphValues.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1442,7 +1648,7 @@ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -1458,10 +1664,10 @@ auto *I = dyn_cast(VL[i]); if (!I) continue; - if (getTreeEntry(I)) { + if (getTreeEntry(VL[i]) || getTreeEntry(VL[i], S.getOpcode())) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1471,7 +1677,7 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1485,7 +1691,7 @@ // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1505,7 +1711,7 @@ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } VL = UniqueValues; @@ -1519,10 +1725,10 @@ if (!BS.tryScheduleBundle(VL, this, S)) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL0) || - !BS.getScheduleData(VL0)->isPartOfBundle()) && + assert((!BS.getScheduleData(VL0, S.getOpcode()) || + !BS.getScheduleData(VL0, S.getOpcode())->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1542,13 +1748,13 @@ LLVM_DEBUG( dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1569,7 +1775,7 @@ if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies); return; } @@ -1586,13 +1792,15 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, 
/*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); - BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, S, + ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); return; } case Instruction::Load: { @@ -1606,8 +1814,8 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1619,8 +1827,8 @@ for (Value *V : VL) { auto *L = cast(V); if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1650,14 +1858,14 @@ if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1666,8 +1874,8 @@ } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1686,14 +1894,14 @@ for (unsigned i = 0; i < VL.size(); ++i) { Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1715,15 +1923,15 @@ CmpInst *Cmp = cast(VL[i]); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i 
= 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1755,7 +1963,7 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1771,10 +1979,18 @@ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (I->getOpcode() == S.getOpcode()) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Operands.push_back(VecOp); + } + if (allSameType(Operands)) + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; @@ -1783,8 +1999,8 @@ for (unsigned j = 0; j < VL.size(); ++j) { if (cast(VL[j])->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } @@ -1797,8 +2013,8 @@ if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } @@ -1809,13 +2025,13 @@ if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1831,13 +2047,13 @@ // Check if the stores are consecutive or of we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1854,8 +2070,8 @@ // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1868,8 +2084,8 @@ if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1879,8 +2095,8 @@ if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1891,23 +2107,31 @@ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); - Operands.push_back(CI2->getArgOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (S.isOpcodeOrAlt(I)) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = getDefaultConstantForOpcode(S.getOpcode(), I->getType()); + Operands.push_back(Operand); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + if (allSameType(Operands)) + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1915,12 +2139,12 @@ // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. 
if (!S.isAltShuffle()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -1935,16 +2159,25 @@ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (S.isOpcodeOrAlt(I)) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = getDefaultConstantForOpcode(S.getOpcode(), I->getType()); + Operands.push_back(Operand); + } buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; default: - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2102,11 +2335,10 @@ } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->State.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + auto *VL0 = cast(E->State.OpValue); + unsigned ShuffleOrOp = E->State.isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : E->State.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2192,7 +2424,7 @@ case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->State.getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2205,7 +2437,8 @@ // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->State.getOpcode(), VecTy, + SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2213,14 +2446,16 @@ case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. 
- int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->State.getOpcode(), + ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->State.getOpcode(), VecTy, + MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: @@ -2246,7 +2481,7 @@ TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = @@ -2257,35 +2492,40 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast(VL[i]); - ConstantInt *CInt = dyn_cast(I->getOperand(1)); - if (!CInt) { - Op2VK = TargetTransformInfo::OK_AnyValue; - Op2VP = TargetTransformInfo::OP_None; - break; - } - if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; + if (auto *CInt = dyn_cast(VL0->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + const unsigned Opcode = E->State.getOpcode(); + for (auto *V : VL) { + auto *I = cast(V); + if (I == VL0 || Opcode != I->getOpcode()) + continue; + if (!isa(I->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + Op2VP = TargetTransformInfo::OP_None; + break; + } + ConstantInt *CInt_cur = cast(I->getOperand(1)); + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + if (Op2VP == TargetTransformInfo::OP_PowerOf2 && + !CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_None; + if (CInt != CInt_cur) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } - if (CInt0 != CInt) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->State.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost(E->State.getOpcode(), VecTy, + Op1VK, Op2VK, Op1VP, Op2VP, + Operands); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2366,11 +2606,11 @@ return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - 
Instruction::isCast(S.getAltOpcode()))) && + assert(E->State.isAltShuffle() && + ((Instruction::isBinaryOp(E->State.getOpcode()) && + Instruction::isBinaryOp(E->State.getAltOpcode())) || + (Instruction::isCast(E->State.getOpcode()) && + Instruction::isCast(E->State.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -2387,23 +2627,22 @@ } for (Value *i : VL) { Instruction *I = cast(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->State.getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->State.getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->State.getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->State.MainOp->getOperand(0)->getType(); + Type *Src1SclTy = E->State.AltOp->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->State.getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->State.getAltOpcode(), VecTy, Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -2469,7 +2708,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = dyn_cast(N.State.OpValue); if (!Inst) continue; @@ -2654,9 +2893,13 @@ // Push left and right operands of binary operation into Left and Right for (Value *V : VL) { auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); + if (S.isOpcodeOrAlt(I)) { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } else { + Left.push_back(I); + Right.push_back(getDefaultConstantForOpcode(S.getOpcode(), I->getType())); + } } // Reorder if we have a commutative operation and consecutive access @@ -2705,8 +2948,13 @@ int i, unsigned Opcode, Instruction &I, ArrayRef Left, ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); + if (I.getOpcode() == Opcode) { + VLeft = I.getOperand(0); + VRight = I.getOperand(1); + } else { + VLeft = &I; + VRight = getDefaultConstantForOpcode(Opcode, I.getType()); + } // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { if (VRight == Right[i - 1]) @@ -2770,8 +3018,15 @@ // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. 
auto *I = cast(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); + Value *VLeft; + Value *VRight; + if (I->getOpcode() == Opcode) { + VLeft = I->getOperand(0); + VRight = I->getOperand(1); + } else { + VLeft = I; + VRight = getDefaultConstantForOpcode(Opcode, I->getType()); + } if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); @@ -2869,17 +3124,14 @@ // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; - // Find the last instruction. The common case should be that BB has been - // scheduled, and the last instruction is VL.back(). So we start with - // VL.back() and iterate over schedule data until we reach the end of the - // bundle. The end of the bundle is marked by null ScheduleData. + // Find the last instruction. If the bundle is not scheduled then + // the first in the bundle is the last one in BB, because we discover + // bundles in backward walk. if (BlocksSchedules.count(BB)) { auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlocksSchedules[BB]->getInstScheduleData(isOneOf(S, VL.back())); if (Bundle && Bundle->isPartOfBundle()) - for (; Bundle; Bundle = Bundle->NextInBundle) - if (Bundle->OpValue == Bundle->Inst) - LastInst = Bundle->Inst; + LastInst = Bundle->FirstInBundle->getInst(); } // LastInst can still be null at this point if there's either not an entry @@ -2953,7 +3205,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { + if (TreeEntry *E = getTreeEntry(S.OpValue, S.getOpcode())) { if (E->isSame(VL)) { Value *V = vectorizeTree(E); if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { @@ -3026,12 +3278,12 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " + << *E->State.OpValue << ".\n"); return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(S.OpValue); + auto *VL0 = cast(E->State.OpValue); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3040,7 +3292,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3054,8 +3306,8 @@ return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = E->State.isAltShuffle() ? 
+ (unsigned) Instruction::ShuffleVector : E->State.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3117,7 +3369,7 @@ E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3152,7 +3404,7 @@ E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3181,7 +3433,7 @@ for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *InVec = vectorizeTree(INVL); @@ -3208,7 +3460,7 @@ RHSV.push_back(cast(V)->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); @@ -3220,7 +3472,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->State.getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3242,7 +3494,7 @@ FalseVec.push_back(cast(V)->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *Cond = vectorizeTree(CondVec); Value *True = vectorizeTree(TrueVec); @@ -3282,16 +3534,22 @@ case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); + reorderInputsAccordingToOpcode(E->State.getOpcode(), E->Scalars, + LHSVL, RHSVL); else for (Value *V : E->Scalars) { auto *I = cast(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); + if (I->getOpcode() == E->State.getOpcode()) { + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); + } else { + LHSVL.push_back(V); + RHSVL.push_back( + getDefaultConstantForOpcode(E->State.getOpcode(), I->getType())); + } } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); @@ -3302,7 +3560,7 @@ } Value *V = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(VL0->getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3321,10 +3579,12 @@ // sink them all the way down past store instructions. 
bool IsReorder = !E->ReorderIndices.empty(); if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); + InstructionsState S = getSameOpcode(E->Scalars, + E->ReorderIndices.front()); VL0 = cast(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, S); + } else + setInsertPointAfterBundle(E->Scalars, E->State); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3371,7 +3631,7 @@ for (Value *V : E->Scalars) ScalarStoreValues.push_back(cast(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *VecValue = vectorizeTree(ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); @@ -3398,7 +3658,7 @@ return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); ValueList Op0VL; for (Value *V : E->Scalars) @@ -3433,7 +3693,7 @@ } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -3486,24 +3746,24 @@ } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->State.isAltShuffle() && + ((Instruction::isBinaryOp(E->State.getOpcode()) && + Instruction::isBinaryOp(E->State.getAltOpcode())) || + (Instruction::isCast(E->State.getOpcode()) && + Instruction::isCast(E->State.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars, S); + if (Instruction::isBinaryOp(E->State.getOpcode())) { + reorderAltShuffleOperands(E->State, E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars, E->State); LHS = vectorizeTree(LHSVL); RHS = vectorizeTree(RHSVL); } else { ValueList INVL; for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); LHS = vectorizeTree(INVL); } @@ -3513,16 +3773,16 @@ } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->State.getOpcode())) { V0 = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->State.getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( - static_cast(S.getAltOpcode()), LHS, RHS); + static_cast(E->State.getAltOpcode()), LHS, RHS); } else { V0 = Builder.CreateCast( - static_cast(S.getOpcode()), LHS, VecTy); + static_cast(E->State.getOpcode()), LHS, VecTy); V1 = Builder.CreateCast( - static_cast(S.getAltOpcode()), LHS, VecTy); + static_cast(E->State.getAltOpcode()), LHS, VecTy); } // Create shuffle to take alternate operations from the vector. 
@@ -3533,8 +3793,7 @@ SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + if (OpInst->getOpcode() == E->State.getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -3544,8 +3803,10 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, OpScalars); - propagateIRFlags(V1, AltScalars); + InstructionsState S = getSameOpcode(OpScalars); + propagateIRFlags(V0, OpScalars, S.OpValue); + S = getSameOpcode(AltScalars); + propagateIRFlags(V1, AltScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) @@ -3583,7 +3844,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].State.OpValue; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); @@ -3698,6 +3959,9 @@ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (!Entry->State.isOpcodeOrAlt(cast(Scalar))) + continue; + Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG @@ -3828,9 +4092,14 @@ } for (Value *V : VL) { - ScheduleData *BundleMember = getScheduleData(V); + ScheduleData *BundleMember = getInstScheduleData(V); + if (BundleMember->isPartOfBundle()) + BundleMember = getScheduleData(V, S.getOpcode()); + if (BundleMember->isPartOfBundle()) + return false; assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); + assert(!BundleMember->isPartOfBundle() && "Already part of another bundle"); if (BundleMember->IsScheduled) { // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the @@ -3847,6 +4116,7 @@ Bundle = BundleMember; } BundleMember->UnscheduledDepsInBundle = 0; + BundleMember->Opcode = S.getOpcode(); Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. @@ -3890,18 +4160,27 @@ } } if (!Bundle->isReady()) { - cancelScheduling(VL, S.OpValue); + cancelScheduling(S.OpValue, S.getOpcode()); + // We have to clear all dependencies, since all values + // were calculated for the vectorized bundle. + for (auto *I = ScheduleStart; I != ScheduleEnd; + I = I->getNextNode()) { + doForAllOpcodes(I, [](ScheduleData *SD) { + SD->clearDependencies(); + }); + } + resetSchedule(); return false; } return true; } -void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, - Value *OpValue) { +void BoUpSLP::BlockScheduling::cancelScheduling(Value *OpValue, + unsigned Opcode) { if (isa(OpValue)) return; - - ScheduleData *Bundle = getScheduleData(OpValue); + ScheduleData *Bundle = getScheduleData(OpValue, Opcode)->FirstInBundle; + assert(Bundle && "Counld not find bundle"); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); @@ -3911,44 +4190,66 @@ // Un-bundle: make single instructions out of the bundle. 
ScheduleData *BundleMember = Bundle; while (BundleMember) { - assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); + assert(BundleMember->FirstInBundle == Bundle && "Corrupt bundle links"); + assert(BundleMember->Opcode == Opcode && "Corrupt bundle"); BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { - ReadyInsts.insert(BundleMember); + if (BundleMember->isPseudo()) { + PseudoInstScheduleDataMap[BundleMember->getInst()].erase( + BundleMember->Opcode); + BundleMember->Opcode = 0; + } else { + BundleMember->Opcode = 0; + if (BundleMember->UnscheduledDepsInBundle == 0) { + ReadyInsts.insert(BundleMember); + } } BundleMember = Next; } } -BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { - // Allocate a new ScheduleData for the instruction. +BoUpSLP::InstScheduleData * +BoUpSLP::BlockScheduling::allocateInstScheduleDataChunks() { + // Allocate a new InstScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique(ChunkSize)); + InstScheduleDataChunks.push_back( + llvm::make_unique(ChunkSize)); ChunkPos = 0; } - return &(ScheduleDataChunks.back()[ChunkPos++]); + return &(InstScheduleDataChunks.back()[ChunkPos++]); +} + +BoUpSLP::PseudoScheduleData * +BoUpSLP::BlockScheduling::allocatePseudoInstDataChunks() { + // Allocate a new PseudoScheduleData for the instruction. + if (PseudoChunkPos >= PseudoChunkSize) { + PseudoScheduleDataChunks.push_back( + llvm::make_unique(PseudoChunkSize)); + PseudoChunkPos = 0; + } + return &(PseudoScheduleDataChunks.back()[PseudoChunkPos++]); } bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, const InstructionsState &S) { - if (getScheduleData(V, isOneOf(S, V))) + if (getScheduleData(V, S.getOpcode())) return true; Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && "phi nodes don't need to be scheduled"); auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { - ScheduleData *ISD = getScheduleData(I); + InstScheduleData *ISD = getInstScheduleData(I); if (!ISD) return false; assert(isInSchedulingRegion(ISD) && - "ScheduleData not in scheduling region"); - ScheduleData *SD = allocateScheduleDataChunks(); - SD->Inst = I; - SD->init(SchedulingRegionID, S.OpValue); - ExtraScheduleDataMap[I][S.OpValue] = SD; + "InstScheduleData not in scheduling region"); + if (ISD->isPartOfBundle()) { + PseudoScheduleData *PSD = allocatePseudoInstDataChunks(); + PSD->init(SchedulingRegionID, ISD, S.OpValue, S.getOpcode()); + PseudoInstScheduleDataMap[I][S.getOpcode()] = PSD; + } return true; }; if (CheckSheduleForI(I)) @@ -3958,8 +4259,7 @@ initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; @@ -3981,8 +4281,7 @@ if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); return true; @@ -3994,8 +4293,7 @@ initScheduleData(ScheduleEnd, 
I->getNextNode(), LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); @@ -4009,21 +4307,20 @@ return true; } -void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, - Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore) { - ScheduleData *CurrentLoadStore = PrevLoadStore; +void BoUpSLP::BlockScheduling::initScheduleData( + Instruction *FromI, Instruction *ToI, InstScheduleData *PrevLoadStore, + InstScheduleData *NextLoadStore) { + InstScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - ScheduleData *SD = ScheduleDataMap[I]; + InstScheduleData *SD = InstScheduleDataMap[I]; if (!SD) { - SD = allocateScheduleDataChunks(); - ScheduleDataMap[I] = SD; + SD = allocateInstScheduleDataChunks(); + InstScheduleDataMap[I] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && - "new ScheduleData already in scheduling region"); - SD->init(SchedulingRegionID, I); + "new InstScheduleData already in scheduling region"); + SD->init(SchedulingRegionID); if (I->mayReadOrWriteMemory() && (!isa(I) || @@ -4058,8 +4355,11 @@ WorkList.pop_back(); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; while (BundleMember) { assert(isInSchedulingRegion(BundleMember)); + assert(BundleMember->Opcode == Opcode && "Corrupt bundle member"); + if (!BundleMember->hasValidDependencies()) { LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember @@ -4068,44 +4368,31 @@ BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. - if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. + for (User *U : BundleMember->getInst()->users()) { + if (isa(U)) { + doForAllOpcodes(U, [&BundleMember, &WorkList](ScheduleData *UseSD) { BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); - } + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + }); + } else { + // I'm not sure if this can ever happen. But we need to be safe. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. + BundleMember->Dependencies++; + BundleMember->incrementUnscheduledDeps(1); } } // Handle the memory dependencies. 
         ScheduleData *DepDest = BundleMember->NextLoadStore;
         if (DepDest) {
-          Instruction *SrcInst = BundleMember->Inst;
+          Instruction *SrcInst = BundleMember->getInst();
           MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
-          bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+          bool SrcMayWrite = SrcInst->mayWriteToMemory();
           unsigned numAliased = 0;
           unsigned DistToSrc = 1;
@@ -4120,24 +4407,31 @@
             // It's important for the loop break condition (see below) to
             // check this limit even between two read-only instructions.
             if (DistToSrc >= MaxMemDepDistance ||
-                ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+                ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
                  (numAliased >= AliasedCheckLimit ||
-                  SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+                  SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {

               // We increment the counter only if the locations are aliased
               // (instead of counting all alias checks). This gives a better
               // balance between reduced runtime and accurate dependencies.
               numAliased++;

-              DepDest->MemoryDependencies.push_back(BundleMember);
-              BundleMember->Dependencies++;
-              ScheduleData *DestBundle = DepDest->FirstInBundle;
-              if (!DestBundle->IsScheduled) {
-                BundleMember->incrementUnscheduledDeps(1);
-              }
-              if (!DestBundle->hasValidDependencies()) {
-                WorkList.push_back(DestBundle);
-              }
+              // We don't want any duplicates in the set, to keep the
+              // dependencies correct.
+              doForAllOpcodes(DepDest->getInst(), [&BundleMember, &WorkList](
+                                                      ScheduleData *DepDest) {
+                if (DepDest->MemoryDependencies.count(BundleMember) == 0) {
+                  DepDest->MemoryDependencies.insert(BundleMember);
+                  BundleMember->Dependencies++;
+                  ScheduleData *DestBundle = DepDest->FirstInBundle;
+                  if (!DestBundle->IsScheduled) {
+                    BundleMember->incrementUnscheduledDeps(1);
+                  }
+                  if (!DestBundle->hasValidDependencies()) {
+                    WorkList.push_back(DestBundle);
+                  }
+                }
+              });
             }
             DepDest = DepDest->NextLoadStore;
@@ -4164,7 +4458,7 @@
     }
     if (InsertInReadyList && SD->isReady()) {
       ReadyInsts.push_back(SD);
-      LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+      LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->getInst()
                         << "\n");
     }
   }
@@ -4176,7 +4470,7 @@
   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
     doForAllOpcodes(I, [&](ScheduleData *SD) {
       assert(isInSchedulingRegion(SD) &&
-             "ScheduleData not in scheduling region");
+             "InstScheduleData not in scheduling region");
       SD->IsScheduled = false;
       SD->resetUnscheduledDeps();
     });
@@ -4210,7 +4504,7 @@
        I = I->getNextNode()) {
     BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
       assert(SD->isPartOfBundle() ==
-                 (getTreeEntry(SD->Inst) != nullptr) &&
+                 (getTreeEntry(SD->getInst(), SD->Opcode) != nullptr) &&
              "scheduler and vectorizer bundle mismatch");
       SD->FirstInBundle->SchedulingPriority = Idx++;
       if (SD->isSchedulingEntity()) {
@@ -4231,20 +4525,31 @@
     // Move the scheduled instruction(s) to their dedicated places, if not
     // there yet.
     ScheduleData *BundleMember = picked;
+    unsigned Opcode = BundleMember->Opcode;
     while (BundleMember) {
-      Instruction *pickedInst = BundleMember->Inst;
-      if (LastScheduledInst->getNextNode() != pickedInst) {
-        BS->BB->getInstList().remove(pickedInst);
+      assert(Opcode == BundleMember->Opcode && "Corrupt bundle member");
+      Instruction *PickedInst = BundleMember->getInst();
+      if (LastScheduledInst->getNextNode() != PickedInst) {
+        BS->BB->getInstList().remove(PickedInst);
         BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
-                                     pickedInst);
+                                     PickedInst);
       }
-      LastScheduledInst = pickedInst;
+      LastScheduledInst = PickedInst;
       BundleMember = BundleMember->NextInBundle;
     }
-    BS->schedule(picked, ReadyInsts);
     NumToSchedule--;
   }
+#ifndef NDEBUG
+  if (NumToSchedule != 0) {
+    for (BasicBlock::iterator I = BS->BB->begin(), E = BS->BB->end(); I != E;
+         ++I) {
+      BS->doForAllOpcodes(&*I, [](ScheduleData *SD) {
+        if (SD->isSchedulingEntity() && SD->UnscheduledDepsInBundle != 0)
+          LLVM_DEBUG(dbgs() << "SLP: Failed to schedule: " << *SD << ".\n");
+      });
+    }
+  }
+#endif
   assert(NumToSchedule == 0 && "could not schedule all instructions");

   // Avoid duplicate scheduling of the block.
@@ -4865,6 +5170,10 @@
   InstructionsState S = getSameOpcode(VL);
   if (!S.getOpcode())
     return false;
+  for (Value *V : VL) {
+    if (isOneOf(S, V) != V)
+      return false;
+  }

   Instruction *I0 = cast<Instruction>(S.OpValue);
   unsigned Sz = R.getVectorElementSize(I0);
Index: test/Transforms/SLPVectorizer/X86/cancel_scheduling.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/cancel_scheduling.ll
@@ -0,0 +1,215 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test case shows bundle scheduling failing after cancelScheduling() is
+; called from tryScheduleBundle() without cleaning up all dependencies. The
+; dependency values must be cleared, since everything was already calculated
+; before the bundle was cancelled.
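As an aside for readers tracking the scheduling changes above, the sketch below is a minimal, self-contained illustration of the cleanup the new "Corrupt bundle" asserts guard. It is not code from this patch; MiniScheduleData and cancelBundle are hypothetical stand-ins for the BlockScheduling bookkeeping. The idea: when a bundle is cancelled, every member is unlinked, its private dependency count restored, and its opcode tag reset, so a later tryScheduleBundle() call cannot observe stale state, which appears to be exactly what the test below was reduced to reproduce.

#include <cassert>

// Hypothetical, simplified stand-in for the per-instruction scheduling node.
struct MiniScheduleData {
  MiniScheduleData *FirstInBundle = nullptr;
  MiniScheduleData *NextInBundle = nullptr;
  unsigned Opcode = 0;             // opcode tag the bundle was formed under
  int UnscheduledDeps = 0;         // dependencies of this member alone
  int UnscheduledDepsInBundle = 0; // dependencies of the whole bundle
};

// Cancelling a bundle: unlink every member, restore its own dependency
// count, and drop the opcode tag so nothing leaks into the next attempt.
void cancelBundle(MiniScheduleData *Bundle) {
  for (MiniScheduleData *Member = Bundle; Member;) {
    assert(Member->FirstInBundle == Bundle && "Corrupt bundle links");
    MiniScheduleData *Next = Member->NextInBundle;
    Member->FirstInBundle = Member; // each member is its own bundle again
    Member->NextInBundle = nullptr;
    Member->UnscheduledDepsInBundle = Member->UnscheduledDeps;
    Member->Opcode = 0;             // forget the cancelled bundle's opcode
    Member = Next;
  }
}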
+ +define dso_local void @fn1() local_unnamed_addr #0 { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 2 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 3 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 17 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 16 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 15 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 12 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 11 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX9]], align 2 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 10 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX10]], align 2 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 9 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX11]], align 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 8 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX12]], align 2 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 6 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 5 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[ARRAYIDX2]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> , <8 x i16>* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* inttoptr (i64 1 to i8*), align 1 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 18 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i8 [[TMP1]], 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 19 +; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 20 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* inttoptr (i64 2 to i8*), align 2 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i8 [[TMP4]], 4 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 21 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i8 [[TMP4]], 1 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 22 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 23 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* inttoptr (i64 3 to i8*), align 1 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i8 [[TMP7]], 3 +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 24 +; CHECK-NEXT: [[ARRAYIDX54:%.*]] = getelementptr inbounds [4 x 
i16], [4 x i16]* undef, i64 0, i64 25 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* inttoptr (i64 4 to i8*), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = lshr i8 [[TMP9]], 4 +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 26 +; CHECK-NEXT: [[TMP11:%.*]] = lshr i8 [[TMP9]], 1 +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 27 +; CHECK-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 28 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* inttoptr (i64 5 to i8*), align 1 +; CHECK-NEXT: [[TMP13:%.*]] = lshr i8 [[TMP12]], 3 +; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 29 +; CHECK-NEXT: [[ARRAYIDX79:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 30 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* inttoptr (i64 6 to i8*), align 2 +; CHECK-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 31 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i8 [[TMP14]], 2 +; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 32 +; CHECK-NEXT: [[TMP16:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = or i8 [[TMP3]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i8> [[TMP18]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP19]], i8 [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP6]], i32 4 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[TMP4]], i32 5 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP8]], i32 6 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP25]], i8 [[TMP10]], i32 8 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP11]], i32 9 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[TMP9]], i32 10 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP13]], i32 11 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP12]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP14]], i32 13 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP15]], i32 14 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP16]], i32 15 +; CHECK-NEXT: [[TMP34:%.*]] = ashr <16 x i8> [[TMP33]], +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i8> [[TMP33]], +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[TMP36]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i16> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP37]], <16 x i16> [[TMP38]], <16 x i32> +; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 33 +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i16* [[ARRAYIDX17]] to <16 x i16>* +; CHECK-NEXT: store <16 x i16> [[TMP39]], <16 x i16>* [[TMP40]], align 2 +; CHECK-NEXT: ret void +; +entry: + %arrayidx = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 2 + store i16 2, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds [4 x 
i16], [4 x i16]* undef, i64 0, i64 1 + store i16 2, i16* %arrayidx1, align 2 + %arrayidx2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 0 + store i16 2, i16* %arrayidx2, align 2 + %arrayidx3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 4 + store i16 0, i16* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 3 + store i16 0, i16* %arrayidx4, align 2 + %arrayidx5 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 17 + store i16 7, i16* %arrayidx5, align 2 + %arrayidx6 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 16 + store i16 7, i16* %arrayidx6, align 2 + %arrayidx7 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 15 + store i16 7, i16* %arrayidx7, align 2 + %arrayidx8 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 12 + store i16 7, i16* %arrayidx8, align 2 + %arrayidx9 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 11 + store i16 7, i16* %arrayidx9, align 2 + %arrayidx10 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 10 + store i16 7, i16* %arrayidx10, align 2 + %arrayidx11 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 9 + store i16 7, i16* %arrayidx11, align 2 + %arrayidx12 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 8 + store i16 7, i16* %arrayidx12, align 2 + %arrayidx13 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 7 + store i16 7, i16* %arrayidx13, align 2 + %arrayidx14 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 6 + store i16 7, i16* %arrayidx14, align 2 + %arrayidx15 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 5 + store i16 7, i16* %arrayidx15, align 2 + %0 = load i8, i8* inttoptr (i64 1 to i8*), align 1 + %1 = ashr i8 %0, 7 + %conv16 = sext i8 %1 to i16 + %arrayidx17 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 18 + store i16 %conv16, i16* %arrayidx17, align 2 + %2 = lshr i8 %0, 2 + %3 = and i8 %2, 7 + %conv20 = zext i8 %3 to i16 + %arrayidx21 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 19 + store i16 %conv20, i16* %arrayidx21, align 2 + %4 = and i8 %0, 2 + %arrayidx26 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 20 + %5 = or i8 %4, 1 + %conv29 = zext i8 %5 to i16 + store i16 %conv29, i16* %arrayidx26, align 2 + %6 = load i8, i8* inttoptr (i64 2 to i8*), align 2 + %7 = lshr i8 %6, 4 + %8 = and i8 %7, 7 + %conv33 = zext i8 %8 to i16 + %arrayidx34 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 21 + store i16 %conv33, i16* %arrayidx34, align 2 + %9 = lshr i8 %6, 1 + %10 = and i8 %9, 7 + %conv38 = zext i8 %10 to i16 + %arrayidx39 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 22 + store i16 %conv38, i16* %arrayidx39, align 2 + %11 = and i8 %6, 2 + %conv43 = zext i8 %11 to i16 + %arrayidx44 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 23 + store i16 %conv43, i16* %arrayidx44, align 2 + %12 = load i8, i8* inttoptr (i64 3 to i8*), align 1 + %13 = lshr i8 %12, 3 + %14 = and i8 %13, 7 + %conv48 = zext i8 %14 to i16 + %arrayidx49 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 24 + store i16 %conv48, i16* %arrayidx49, align 2 + %15 = and i8 %12, 7 + %conv53 = zext i8 %15 to i16 + %arrayidx54 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 25 + store i16 %conv53, i16* %arrayidx54, align 2 + %16 = load i8, i8* inttoptr (i64 4 to i8*), align 4 + %17 = lshr i8 
%16, 4 + %18 = and i8 %17, 7 + %conv58 = zext i8 %18 to i16 + %arrayidx59 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 26 + store i16 %conv58, i16* %arrayidx59, align 2 + %19 = lshr i8 %16, 1 + %20 = and i8 %19, 7 + %conv63 = zext i8 %20 to i16 + %arrayidx64 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 27 + store i16 %conv63, i16* %arrayidx64, align 2 + %21 = and i8 %16, 2 + %conv68 = zext i8 %21 to i16 + %arrayidx69 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 28 + store i16 %conv68, i16* %arrayidx69, align 2 + %22 = load i8, i8* inttoptr (i64 5 to i8*), align 1 + %23 = lshr i8 %22, 3 + %24 = and i8 %23, 7 + %conv73 = zext i8 %24 to i16 + %arrayidx74 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 29 + store i16 %conv73, i16* %arrayidx74, align 2 + %25 = and i8 %22, 7 + %conv78 = zext i8 %25 to i16 + %arrayidx79 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 30 + store i16 %conv78, i16* %arrayidx79, align 2 + %26 = load i8, i8* inttoptr (i64 6 to i8*), align 2 + %27 = and i8 %26, 7 + %conv82 = zext i8 %27 to i16 + %arrayidx83 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 31 + store i16 %conv82, i16* %arrayidx83, align 2 + %28 = lshr i8 %26, 2 + %29 = and i8 %28, 7 + %conv87 = zext i8 %29 to i16 + %arrayidx88 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 32 + store i16 %conv87, i16* %arrayidx88, align 2 + %30 = shl i8 %26, 1 + %31 = and i8 %30, 6 + %conv91 = zext i8 %31 to i16 + %arrayidx92 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 33 + store i16 %conv91, i16* %arrayidx92, align 2 + ret void +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/SLPVectorizer/X86/memory-dep.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/memory-dep.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.anon.1.2.3.4.87 = type { [6 x [6 x i16]], [6 x [6 x i32]], [0 x [4 x [4 x i32]]] } + +@f = external dso_local local_unnamed_addr global %struct.anon.1.2.3.4.87, align 4 + +; Function Attrs: norecurse nounwind uwtable +define dso_local void @itrans() local_unnamed_addr #0 { +; CHECK-LABEL: @itrans( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 undef, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 2), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 
3), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 undef, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 undef, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> , [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[TMP6]], undef +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> undef, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i1> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> undef, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i1> [[TMP12]] to <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 0) to <4 x i32>*), align 4 +; CHECK-NEXT: ret void +; +entry: + %add8 = add nsw i32 undef, undef + store i32 undef, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 + %add15 = add nsw i32 undef, undef + %add26 = add nsw i32 %add8, 2 + %sub27 = sub i32 %add26, undef + %add33 = add nsw i32 %sub27, undef + %shl = shl i32 %add33, 6 + %cmp.i = icmp slt i32 %shl, 1 + %conv.i = zext i1 %cmp.i to i32 + %cmp1.i = icmp slt i32 undef, %conv.i + %conv2.i = zext i1 %cmp1.i to i32 + store i32 %conv2.i, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 0), align 4 + %add26.1 = add nsw i32 %add15, 2 + %sub27.1 = sub i32 %add26.1, undef + %add33.1 = add nsw i32 %sub27.1, undef + %shl.1 = shl i32 %add33.1, 6 + %cmp.i.1 = icmp slt i32 %shl.1, 1 + %conv.i.1 = zext i1 %cmp.i.1 to i32 + %cmp1.i.1 = icmp slt i32 undef, %conv.i.1 + %conv2.i.1 = zext i1 %cmp1.i.1 to i32 + store i32 %conv2.i.1, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 1), align 4 + %0 = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 2), align 4 + %add26.2 = add nsw i32 %0, 2 + %sub27.2 = sub i32 %add26.2, undef + %add33.2 = add nsw i32 %sub27.2, undef + %shl.2 = shl i32 %add33.2, 6 + %cmp.i.2 = icmp slt i32 %shl.2, 1 + %conv.i.2 = zext i1 %cmp.i.2 to i32 + %cmp1.i.2 = icmp slt i32 undef, %conv.i.2 + %conv2.i.2 = zext i1 %cmp1.i.2 to i32 + store i32 %conv2.i.2, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 2), align 4 + %1 = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 + %add26.3 = add nsw i32 %1, 2 + %sub27.3 = sub i32 %add26.3, undef + %add33.3 = add nsw i32 %sub27.3, undef + %shl.3 = shl i32 %add33.3, 6 + %cmp.i.3 = icmp slt i32 %shl.3, 1 + %conv.i.3 = zext i1 %cmp.i.3 to i32 + %cmp1.i.3 = icmp slt i32 undef, %conv.i.3 + %conv2.i.3 = zext i1 %cmp1.i.3 to i32 + store i32 %conv2.i.3, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 3), align 4 + ret void +} Index: test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/pr35497.ll +++ test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -12,20 +12,20 @@ 
define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 { ; CHECK-LABEL: @_ZN1C10SwitchModeEv( ; CHECK-NEXT: for.body.lr.ph.i: -; CHECK-NEXT: [[OR_1:%.*]] = or i64 undef, 1 -; CHECK-NEXT: store i64 [[OR_1]], i64* undef, align 8 +; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[BAR5]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: store i64 [[TMP2]], i64* undef, align 8 ; CHECK-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP1]], [[TMP4]] ; CHECK-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; for.body.lr.ph.i: Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -43,22 +43,16 @@ ; CHECK-LABEL: @add1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] 
= bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -86,22 +80,16 @@ ; CHECK-LABEL: @sub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -205,22 +193,18 @@ ; CHECK-LABEL: @addsub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x 
i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -248,22 +232,18 @@ ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -291,22 +271,16 @@ ; CHECK-LABEL: @mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 -; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* 
[[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -334,22 +308,16 @@ ; CHECK-LABEL: @shl0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -453,22 +421,16 @@ ; CHECK-LABEL: @add1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -496,22 +458,16 @@ ; CHECK-LABEL: @sub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* 
[[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -615,22 +571,18 @@ ; CHECK-LABEL: @addsub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -658,22 +610,18 @@ ; 
CHECK-LABEL: @addsub1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -701,22 +649,16 @@ ; CHECK-LABEL: @mulf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 -; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 
x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -825,22 +767,16 @@ ; CHECK-LABEL: @sub0fn( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry:
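To close, here is a hedged illustration of what the vect_copyable_in_binops.ll updates above are about. The function below is my own example, not part of the patch, and the name add1_like is hypothetical. One lane of the group is a plain copy; treating that copy as the surrounding binary operation applied with a neutral constant (an add of 0 here) gives all four lanes the same opcode, so the SLP vectorizer can emit a single vector load, one vector add against a constant such as <0, 1, 2, 3> (the exact vector constants are elided in the autogenerated CHECK lines above), and a single vector store.

// Hypothetical scalar source resembling the @add1 test: lane 0 is a plain
// copy, lanes 1-3 add different constants. Modelling the copy as
// "src[0] + 0" makes all four lanes the same binary opcode and lets them
// vectorize together. Pointers are assumed not to alias.
void add1_like(int *dst, const int *src) {
  dst[0] = src[0];
  dst[1] = src[1] + 1;
  dst[2] = src[2] + 2;
  dst[3] = src[3] + 3;
}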