diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -167,6 +167,16 @@ cl::desc("The maximum number of users to visit while visiting the " "predecessors. This prevents compilation time increase.")); +/// Perform operand reordering across chains of commutative operations (which we +/// refer to as SuperNodes). +static cl::opt EnableSuperNode( + "slp-enable-supernode", cl::init(true), cl::Hidden, + cl::desc("Enable SuperNodes and operand reordering across them")); + +static cl::opt MaxSuperNodeSize( + "slp-max-supernode-size", cl::init(2), cl::Hidden, + cl::desc("Limit the size of the SuperNode to this many TreeEntries")); + static cl::opt ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -510,6 +520,7 @@ class BoUpSLP { struct TreeEntry; struct ScheduleData; + struct BlockScheduling; public: using ValueList = SmallVector; @@ -523,8 +534,9 @@ TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) - : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), - DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { + : CurrSuperNode(*DL, *Se, *this), F(Func), SE(Se), TTI(Tti), TLI(TLi), + AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), + Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. @@ -661,6 +673,14 @@ TreeEntry *UserTE = nullptr; /// The operand index of the use. unsigned EdgeIdx = UINT_MAX; + /// The APOs across each lane. + SmallVector APOs; + /// Initialize the APOs. + void initAPOs(unsigned NumLanes) { + // Initialize the APOs in 'UserTreeIdx'. 
+ APOs.resize(NumLanes); + std::fill(APOs.begin(), APOs.end(), false); + } #ifndef NDEBUG friend inline raw_ostream &operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI) { @@ -670,7 +690,10 @@ /// Debug print. void dump(raw_ostream &OS) const { OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null") - << " EdgeIdx:" << EdgeIdx << "}"; + << " EdgeIdx:" << EdgeIdx << " APOs:"; + for (bool APO : APOs) + OS << APO << ","; + OS << "}"; } LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } #endif @@ -1130,9 +1153,6 @@ } } - /// \returns the number of operands. - unsigned getNumOperands() const { return OpsVec.size(); } - /// \returns the number of lanes. unsigned getNumLanes() const { return OpsVec[0].size(); } @@ -1144,9 +1164,6 @@ /// \returns true if the data structure is empty. bool empty() const { return OpsVec.empty(); } - /// Clears the data. - void clear() { OpsVec.clear(); } - /// \Returns true if there are enough operands identical to \p Op to fill /// the whole vector. /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. @@ -1174,6 +1191,9 @@ } public: + VLOperands(const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) + : DL(DL), SE(SE), R(R){}; + /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef RootVL, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) @@ -1182,6 +1202,32 @@ appendOperandsOfVL(RootVL); } + /// Append the \p OpIdx 'th operands of \p VL for all lanes. + void appendOperands(ArrayRef OpVL, const EdgeInfo &EI, + ArrayRef APOVec) { + unsigned NumLanes = OpVL.size(); + assert((OpsVec.empty() || NumLanes == getNumLanes()) && + "Must keep same num of lanes"); + unsigned SNOpIdx = OpsVec.size(); + OpsVec.resize(SNOpIdx + 1); + OpsVec[SNOpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) + OpsVec[SNOpIdx][Lane] = {OpVL[Lane], APOVec[Lane], false}; + } + + /// \returns the number of operands. 
+ unsigned getNumOperands() const { return OpsVec.size(); } + + /// Clears the data. + void clear() { OpsVec.clear(); } + + // Since operand reordering is performed on groups of commutative + // operations or alternating sequences (e.g., +, -), we can safely + // tell the inverse operations by checking commutativity. + static bool computeAPO(unsigned OpIdx, bool UserAPO, bool IsUserInverse) { + return (OpIdx == 0) ? UserAPO : (IsUserInverse ? !UserAPO : UserAPO); + } + /// \Returns a value vector with the operands across all lanes for the /// opearnd at \p OpIdx. ValueList getVL(unsigned OpIdx) const { @@ -1193,6 +1239,15 @@ return OpVL; } + /// \Returns the APOs across all lanes for \p OpIdx. + SmallVector getAPOVec(unsigned OpIdx) const { + unsigned NumLanes = getNumLanes(); + SmallVector APOVec(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) + APOVec[Lane] = getData(OpIdx, Lane).APO; + return APOVec; + } + // Performs operand reordering for 2 or more operands. // The original operands are in OrigOps[OpIdx][Lane]. // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. @@ -1362,6 +1417,11 @@ /// \returns the cost of the vectorizable entry. int getEntryCost(TreeEntry *E); + /// Recursively build the supernode. + void buildSuperNode_rec(ArrayRef VL, TreeEntry *TE, unsigned Depth, + const EdgeInfo &UserTreeIdx, + BoUpSLP::BlockScheduling &BS); + /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef Roots, unsigned Depth, const EdgeInfo &EI); @@ -1467,7 +1527,7 @@ void setOperand(unsigned OpIdx, ArrayRef OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); - assert(Operands[OpIdx].size() == 0 && "Already resized?"); + Operands[OpIdx].clear(); Operands[OpIdx].resize(Scalars.size()); for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) Operands[OpIdx][Lane] = OpVL[Lane]; @@ -1606,6 +1666,220 @@ #endif }; + /// This class represents a SuperNode of TreeEntries. 
+ /// A supernode is a subgraph of the SLP graph containing chains of nodes
+ /// of legal opcodes. For now, the SuperNode is a tree of Add/Subs.
+ ///
+ /// For example a SuperNode may look like this:
+ /// \verbatim
+ /// op0 op1
+ /// +-------\-/-+
+ /// | TE2 |op2 op3 op4
+ /// | \ /------\-/--+
+ /// | TE1 TE3 |
+ /// | \ / |
+ /// |SuperNode TE0 | TE#: TreeEntry node
+ /// +-----------------------+ op#: Immediate predecessor of node
+ /// \endverbatim
+ ///
+ /// The SuperNode is based on ideas described in:
+ /// Super-Node SLP: Optimized Vectorization for Code Sequences Containing
+ /// Operators and Their Inverse Elements, CGO 2019 by Vasileios Porpodas,
+ /// Rodrigo C. O. Rocha, Evgueni Brevnov, Luís F. W. Góes, Timothy Mattson.
+ class SuperNode {
+ /// The TreeEntries that are part of this SuperNode.
+ SmallSet TreeEntries;
+
+ /// The Root of the SuperNode.
+ TreeEntry *RootTE = nullptr;
+
+ /// All instructions in the SuperNode should have this opcode.
+ unsigned Opcode = 0;
+
+ /// Holds the operands of the SuperNode that will be reordered.
+ VLOperands Operands;
+
+ /// The edges that correspond to the operands of this SuperNode.
+ SmallVector Edges;
+
+ /// All nodes in the SuperNode must have the same number of lanes.
+ unsigned NumLanes = 0;
+
+ /// \Returns the inverse opcode of \p Opc, e.g. the inverse of ADD is SUB.
+ static unsigned getInverseOpcode(unsigned Opc) {
+ switch (Opc) {
+ case Instruction::Add:
+ return Instruction::Sub;
+ case Instruction::Sub:
+ return Instruction::Add;
+ case Instruction::FAdd:
+ return Instruction::FSub;
+ case Instruction::FSub:
+ return Instruction::FAdd;
+ default:
+ return 0;
+ }
+ }
+
+ /// \Returns true if \p VL has compatible opcodes to this SuperNode. 
+ bool hasCompatibleOpcodesWithSuperNode(ArrayRef VL) const { + return llvm::all_of(VL, [this](Value *V) { + return isa(V) && + (cast(V)->getOpcode() == Opcode || + cast(V)->getOpcode() == getInverseOpcode(Opcode)); + }); + } + + /// \returns true if all scalars in TreeEntry \p TEIdx have a single use. + static bool hasSingleUse(ArrayRef VL) { + return std::all_of(VL.begin(), VL.end(), + [](Value *V) { return V->hasOneUse(); }); + } + + /// \returns true if the binary operator \p BO allows reassociation. + static bool canReassociate(BinaryOperator *BO) { + // If fmath, then check the fast-math flags. + if (auto FPI = dyn_cast(BO)) + return FPI->getFastMathFlags().allowReassoc(); + // Else if an overflowing operator, check for the NSW flag. + else if (auto OBO = dyn_cast(BO)) + return OBO->hasNoSignedWrap(); + // Else it is legal to reassociate. + return true; + } + + /// \returns true if all entries in \p VL allow operand reassociation. + static bool allowReassociation(ArrayRef VL) { + for (Value *V : VL) { + assert(isa(V) && "Expected binary operators in VL"); + BinaryOperator *I = cast(V); + if (!canReassociate(I)) + return false; + } + return true; + } + + /// \returns true if \p VL is compatible with this SuperNode. + /// That is if \p VL : + /// (i) has the same opcode as the rest of the SuperNode, + /// (ii) has a single use, + /// (iii) has the same number of lanes as the rest of SuperNode, and + /// (iv) contains instructions that allow reassociation. + /// (v) is in the same BB as the root of the SuperNode. + bool isCompatibleVL(ArrayRef VL) const { + BasicBlock *BB0 = cast(RootTE->Scalars[0])->getParent(); + bool IsInSameBB = (!isa(VL[0])) + ? 
true + : cast(VL[0])->getParent() == BB0; + return hasCompatibleOpcodesWithSuperNode(VL) && hasSingleUse(VL) && + VL.size() == NumLanes && allowReassociation(VL) && IsInSameBB; + } + + public: + SuperNode(const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) + : Operands(DL, SE, R) {} + + /// \Returns true if \p VL contains legal opcodes for a SuperNode. + static bool canInit(ArrayRef VL) { + assert(isa(VL[0]) && "Expected Instruction"); + unsigned Opcode0 = cast(VL[0])->getOpcode(); + if (getInverseOpcode(Opcode0) == 0) + return false; + return std::all_of(std::next(VL.begin()), VL.end(), [&](Value *V) { + assert(isa(V) && "Expected Instruction"); + unsigned Opcode = cast(V)->getOpcode(); + return Opcode == Opcode0 || Opcode == getInverseOpcode(Opcode0); + }); + } + + /// Start forming a new SuperNode. Set \p RootTreeEntry as the root. + /// \returns true on success. + bool init(TreeEntry *RootTreeEntry) { + const ValueList &VL = RootTreeEntry->Scalars; + Opcode = cast(VL[0])->getOpcode(); + RootTE = RootTreeEntry; + NumLanes = VL.size(); + return true; + } + + /// \returns true if \p VL can become a SuperNode entry. + /// This checks if we have not reached the size limit and if the \p VL is + /// compatible. + bool canExtendTowards(ArrayRef VL) const { + return size() < MaxSuperNodeSize && isCompatibleVL(VL); + } + + /// Append the TreeEntry \p TE, which must be compatible with this SuperNode. + void appendEntry(TreeEntry *TE) { + assert((empty() || isCompatibleVL(TE->Scalars)) && "Missing check."); + TreeEntries.insert(TE); + } + + /// Append \p OpVL to the vector containing the operands of this SuperNode. + void appendOperands(ArrayRef UserVL, const EdgeInfo &EI, + ArrayRef APOVec) { + Operands.appendOperands(UserVL, EI, APOVec); + Edges.push_back(EI); + assert(Operands.getNumOperands() == Edges.size() && "out of sync"); + } + + /// \returns true if there are no entries in this SuperNode. 
+ bool empty() const { return TreeEntries.empty(); } + + /// \returns the number of TreeEntries in the SuperNode. + size_t size() const { return TreeEntries.size(); } + + /// Clears all data. + void clear() { + TreeEntries.clear(); + RootTE = nullptr; + Opcode = 0; + Operands.clear(); + Edges.clear(); + NumLanes = 0; + } + + /// \returns the index of the root TreeEntry. + TreeEntry *getRoot() const { + return RootTE; + } + + /// \returns the operands of the SuperNode. + const VLOperands &getOperands() const { return Operands; } + + /// \returns the number of operands of the SuperNode. + unsigned getNumOperands() const { return Operands.getNumOperands(); } + + /// \returns the edge that corresponds to \p OpIdx 'th operand. + const EdgeInfo &getEdge(unsigned OpIdx) const { return Edges[OpIdx]; } + + /// Reorder operands across the whole SuperNode to improve vectorization. + void reorderOperands(BoUpSLP *R) { + Operands.reorder(); + // We update the operands of the TreeEntries. + for (int Idx = 0, E = Edges.size(); Idx != E; ++Idx) + Edges[Idx].UserTE->setOperand(Edges[Idx].EdgeIdx, Operands.getVL(Idx)); + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Debug printers. + void dump(raw_ostream &OS) const { + OS << "TreeEntries: "; + for (TreeEntry *TE : TreeEntries) + OS << TE->Idx << ", "; + OS << "\n"; + + for (TreeEntry *TE : TreeEntries) + TE->dump(); + OS << "-------------\n"; + OS << "Root TE: " << RootTE->Idx << "\n"; + OS << "Operands:\n"; + Operands.print(OS); + } + LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } +#endif + }; + /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef VL, Optional Bundle, const InstructionsState &S, @@ -1651,6 +1925,9 @@ /// Holds all of the tree entries. TreeEntry::VecTreeTy VectorizableTree; + /// Holds the tree entries that are in the SuperNode being constructed. + SuperNode CurrSuperNode; + #ifndef NDEBUG /// Debug printer. 
LLVM_DUMP_METHOD void dumpVectorizableTree() const { @@ -2051,6 +2328,9 @@ tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, const InstructionsState &S); + /// Schedule all ready bundles. + void scheduleReady(); + /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef VL, Value *OpValue); @@ -2075,6 +2355,9 @@ /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); + /// \returns true if \p VL is already scheduled. + bool alreadyScheduled(Value *V) { return getScheduleData(V); } + BasicBlock *BB; /// Simple memory allocation for ScheduleData. @@ -2394,6 +2677,95 @@ } } +// Building the CurrSuperNode changes the way we do the recursion. +// Originally, we used to do a DFS towards the definitions. +// When building the SuperNode we change the recursion towards nodes that +// may be added to the SuperNode. After that, we continue the recursion +// from the operands of the SuperNode. For example: +// +// L2 L3 +// \ / +// L1 B(+) +// \ / +// A(+) +// | +// S Visiting order +// -------------- +// Originally: S, A+, L1, B+, L2, L3 +// w/SuperNode: S, A+, B+, L1, L2, L3 +// |______| +// SuperNode +// +void BoUpSLP::buildSuperNode_rec(ArrayRef VL, TreeEntry *TE, + unsigned Depth, const EdgeInfo &UserTreeIdx, + BoUpSLP::BlockScheduling &BS) { + // If we are building a new supernode, TE is the root entry. + if (CurrSuperNode.empty()) + CurrSuperNode.init(TE); + // Add a new entry to the supernode under construction. + CurrSuperNode.appendEntry(TE); + + // Reorder the operands of VL. + VLOperands Ops(VL, *DL, *SE, *this); + Ops.reorder(); + + // Update TE to reflect the reordered operands. This is needed for the + // scheduler. + for (unsigned OpIdx = 0, NumOperands = Ops.getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + const ValueList &OpVL = Ops.getVL(OpIdx); + TE->setOperand(OpIdx, OpVL); + } + // We are now ready to continue the recursion towards the operands. 
+ for (unsigned OpIdx = 0, NumOperands = Ops.getNumOperands();
+ OpIdx != NumOperands; ++OpIdx) {
+ const ValueList &OpVL = Ops.getVL(OpIdx);
+ if (CurrSuperNode.canExtendTowards(OpVL))
+ // If the operands are compatible with the supernode, continue the
+ // recursion towards them. 'OpVL' will be part of the supernode.
+ buildTree_rec(OpVL, Depth + 1, {TE, OpIdx});
+ else {
+ // Else, stop the recursion. These operands will now become
+ // operands of the supernode.
+ CurrSuperNode.appendOperands(OpVL, {TE, OpIdx}, Ops.getAPOVec(OpIdx));
+ }
+ }
+ // We have built the supernode if it is non-empty and we are back at the root.
+ bool BuiltSuperNode =
+ !CurrSuperNode.empty() && TE->Idx == CurrSuperNode.getRoot()->Idx;
+ if (BuiltSuperNode) {
+ // This is rather ugly, but I don't see a cleaner way.
+ // The problem is that even after calling trySchedule(Bundle),
+ // 'Bundle' is not actually scheduled, only its successors are. This
+ // results in some of the nodes of a supernode (the ones closer to the
+ // root) being scheduled, while others not. Scheduled instructions
+ // have their predecessors dependence edges updated. This causes a
+ // problem after reordering, because some of these dependence counters
+ // may be updated twice. With scheduleReady() we force-schedule all
+ // the ready bundles (that is all of the bundles of the supernode)
+ // before we do the reordering, in order to avoid this double
+ // increment of the dependence counters.
+ // Another alternative is to unschedule and reschedule the nodes.
+ BS.scheduleReady();
+
+ // Reordering across the whole supernode.
+ CurrSuperNode.reorderOperands(this);
+
+ // Create a temporary copy of the CurrSuperNode for looping
+ // through its operands after it has been cleared.
+ SuperNode TmpSuperNode = CurrSuperNode;
+ // We must clear the CurrSuperNode at this point because the
+ // recursion should continue without any active supernode. 
+ CurrSuperNode.clear(); + // Resume the recursion towards the operands of the supernode. + for (unsigned OpIdx = 0, NumOperands = TmpSuperNode.getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + const ValueList OpVL = TmpSuperNode.getOperands().getVL(OpIdx); + buildTree_rec(OpVL, Depth + 1, TmpSuperNode.getEdge(OpIdx)); + } + } +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -2811,26 +3183,33 @@ ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); - // Sort operands of the instructions so that each side is more likely to - // have the same opcode. - if (isa(VL0) && VL0->isCommutative()) { - ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); - TE->setOperand(0, Left); - TE->setOperand(1, Right); - buildTree_rec(Left, Depth + 1, {TE, 0}); - buildTree_rec(Right, Depth + 1, {TE, 1}); - return; - } + if (EnableSuperNode && + // We are either in progress, or we can create a new one. + (!CurrSuperNode.empty() || CurrSuperNode.canInit(VL))) + buildSuperNode_rec(VL, TE, Depth, UserTreeIdx, BS); + // The default recursion. + else { + // Sort operands of the instructions so that each side is more likely to + // have the same opcode. + if (isa(VL0) && VL0->isCommutative()) { + ValueList Left, Right; + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + TE->setOperand(0, Left); + TE->setOperand(1, Right); + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); + return; + } - TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + TE->setOperandsInOrder(); + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. 
+ for (Value *j : VL) + Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, i}); + } } return; } @@ -3054,25 +3433,33 @@ ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); - // Reorder operands if reordering would enable vectorization. - if (isa(VL0)) { - ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); - TE->setOperand(0, Left); - TE->setOperand(1, Right); - buildTree_rec(Left, Depth + 1, {TE, 0}); - buildTree_rec(Right, Depth + 1, {TE, 1}); - return; - } + if (EnableSuperNode && + // We are either in progress, or we can create a new one. + (!CurrSuperNode.empty() || CurrSuperNode.canInit(VL))) + buildSuperNode_rec(VL, TE, Depth, UserTreeIdx, BS); + // The default recursion. + else { + // Reorder operands if reordering would enable vectorization. + if (isa(VL0)) { + ValueList Left, Right; + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + TE->setOperand(0, Left); + TE->setOperand(1, Right); + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); + return; + } - TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + TE->setOperandsInOrder(); + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. 
+ for (Value *V : VL) + Operands.push_back(cast(V)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, i}); + } + return; } return; } @@ -4879,6 +5266,14 @@ return Bundle; } +void BoUpSLP::BlockScheduling::scheduleReady() { + while (!ReadyInsts.empty()) { + ScheduleData *pickedSD = ReadyInsts.pop_back_val(); + if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) + schedule(pickedSD, ReadyInsts); + } +} + void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, Value *OpValue) { if (isa(OpValue)) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll @@ -15,18 +15,14 @@ ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]] -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> 
[[TMP5]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP5]] ; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; ENABLED-NEXT: ret void @@ -74,18 +70,14 @@ ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]] -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1 -; ENABLED-NEXT: 
[[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP5]] ; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; ENABLED-NEXT: ret void @@ -134,18 +126,20 @@ ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; ENABLED-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 -; ENABLED-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8 -; ENABLED-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]] -; ENABLED-NEXT: [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]] -; ENABLED-NEXT: [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]] -; ENABLED-NEXT: [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]] -; ENABLED-NEXT: store double [[SUB0]], double* [[IDXS0]], align 8 -; ENABLED-NEXT: store double [[ADD1]], double* [[IDXS1]], align 8 +; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* +; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; ENABLED-NEXT: 
[[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> +; ENABLED-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP5]] +; ENABLED-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP5]] +; ENABLED-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x i32> +; ENABLED-NEXT: [[TMP12:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; ENABLED-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 ; ENABLED-NEXT: ret void ; entry: @@ -216,17 +210,15 @@ ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 ; ENABLED-NEXT: [[C:%.*]] = load double, double* [[IDXC]], align 8 -; ENABLED-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; ENABLED-NEXT: [[D:%.*]] = load 
double, double* [[IDXD]], align 8 -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[D]], i32 1 -; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] +; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0 +; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[D]], i32 1 +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP5]] +; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP3]] ; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; ENABLED-NEXT: ret void