Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -634,6 +634,235 @@
 #endif
   };
 
+  /// During operand reordering, we are trying to select the operand at each
+  /// lane that matches best with the operand at the neighboring lane. Our
+  /// selection is based on the type of value we are looking for. For example,
+  /// if the neighboring lane has a load, we need to look for a load that is
+  /// accessing a consecutive address.
+  /// These strategies are summarized in the 'ReorderingMode' enumeration.
+  enum class ReorderingMode {
+    Load,     // Matching loads to consecutive memory addresses
+    Opcode,   // Matching instructions based on opcode (same or alternate)
+    Constant, // Matching constants
+    Splat,    // Matching the same instruction multiple times (broadcast)
+    Failed,   // We failed to create a vectorizable group
+  };
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  LLVM_DUMP_METHOD static const char *getModeStr(ReorderingMode RMode) {
+    switch (RMode) {
+    case ReorderingMode::Load:
+      return "Load";
+    case ReorderingMode::Opcode:
+      return "Opcode";
+    case ReorderingMode::Constant:
+      return "Constant";
+    case ReorderingMode::Splat:
+      return "Splat";
+    case ReorderingMode::Failed:
+      return "Failed";
+    }
+    llvm_unreachable("Unimplemented Reordering Type");
+  }
+
+  LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
+                                                 raw_ostream &OS) {
+    return OS << getModeStr(RMode);
+  }
+
+  /// Debug print.
+  LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
+    printMode(RMode, dbgs());
+  }
+
+  friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
+    return printMode(RMode, OS);
+  }
+
+  LLVM_DUMP_METHOD static void
+  dumpModeVec(const SmallVectorImpl<ReorderingMode> &ReorderingModes) {
+    for (unsigned OpIdx = 0, E = ReorderingModes.size(); OpIdx != E; ++OpIdx)
+      dbgs() << OpIdx << ". " << ReorderingModes[OpIdx] << "\n";
+  }
+#endif
+
+  /// A helper data structure to hold the operands of a vector of instructions.
+  /// This supports a fixed vector length for all operand vectors.
+  class VLOperands {
+    /// For each operand we need (i) the value, and (ii) the opcode that it
+    /// would be attached to if the expression was in a left-linearized form.
+    /// This is required to avoid illegal operand reordering.
+    /// For example:
+    /// \verbatim
+    ///                         0 Op1
+    ///                         |/
+    /// Op1 Op2   Linearized    + Op2
+    ///   \ /     ---------->   |/
+    ///    -                    -
+    ///
+    /// Op1 - Op2            (0 + Op1) - Op2
+    /// \endverbatim
+    ///
+    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
+    ///
+    /// Another way to think of this is to track all the operations across the
+    /// path from the operand all the way to the root of the tree and to
+    /// calculate the operation that corresponds to this path. For example, the
+    /// path from Op2 to the root crosses the RHS of the '-', therefore the
+    /// corresponding operation is a '-' (which matches the one in the
+    /// linearized tree, as shown above).
+    ///
+    /// For lack of a better term, we refer to this operation as Accumulated
+    /// Path Operation (APO).
+    struct OperandData {
+      /// The operand value.
+      Value *V;
+      /// TreeEntries only allow a single opcode, or an alternate sequence of
+      /// them (e.g., +, -). Therefore, we can safely use a boolean value for
+      /// the APO. It is set to 'true' if 'V' is attached to an inverse
+      /// operation in the left-linearized form (e.g., Sub/Div), and 'false'
+      /// otherwise (e.g., Add/Mul).
+      bool APO;
+      /// Helper data for the reordering function.
+      bool IsUsed;
+    };
+    using OperandDataVec = SmallVector<OperandData, 2>;
+
+    /// A vector of operand vectors.
+    SmallVector<OperandDataVec, 4> OpsVec;
+
+  public:
+    VLOperands() {}
+    /// Initialize with all the operands of the instruction vector \p VL.
+    VLOperands(ArrayRef<Value *> VL) {
+        // Append all the operands of VL.
+        appendOperandsOfVL(VL);
+    }
+
+    /// \Returns true if \p I is an inverse operation with respect to a
+    /// corresponding commutative operation. For example, it will return
+    /// true for a Sub because it is the inverse operation of an Add.
+    static bool isInverseOperation(Instruction *I) {
+      // Since operand reordering is performed on groups of commutative
+      // operations or alternating sequences (e.g., +, -), we can safely tell
+      // the inverse operations by checking commutativity.
+      return !isCommutative(I);
+    }
+
+    /// Go through the instructions in VL and append their operands.
+    void appendOperandsOfVL(ArrayRef<Value *> VL) {
+      assert(!VL.empty() && "Bad VL");
+      assert((empty() || VL.size() == getNumLanes()) &&
+             "Expected same number of lanes");
+      assert(isa<Instruction>(VL[0]) && "Expected instruction");
+      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+      OpsVec.resize(NumOperands);
+      unsigned NumLanes = VL.size();
+      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+        OpsVec[OpIdx].resize(NumLanes);
+        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+          assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
+          // Our tree has just 3 nodes: the root and two operands.
+          // It is therefore trivial to get the APO. We only need to check the
+          // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
+          // RHS operand. The LHS operand of both add and sub is never attached
+          // to an inverse operation in the linearized form, therefore its APO
+          // is false. The RHS is true only if VL[Lane] is an inverse operation.
+          bool APO = (OpIdx == 0)
+                         ? false
+                         : isInverseOperation(cast<Instruction>(VL[Lane]));
+          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
+                                 APO, false};
+        }
+      }
+    }
+
+    /// Swap the operands at \p OpIdx1 and \p OpIdx2 in lane \p Lane.
+    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
+      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
+    }
+
+    /// \returns the number of operands.
+    unsigned getNumOperands() const { return OpsVec.size(); }
+
+    /// \returns the number of lanes.
+    unsigned getNumLanes() const { return OpsVec[0].size(); }
+
+    /// \returns the operand data at \p OpIdx and \p Lane.
+    OperandData &getData(unsigned OpIdx, unsigned Lane) {
+      return OpsVec[OpIdx][Lane];
+    }
+
+    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
+    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
+      return OpsVec[OpIdx][Lane];
+    }
+
+    /// \returns the operand value at \p OpIdx and \p Lane.
+    Value *getValue(unsigned OpIdx, unsigned Lane) {
+      return getData(OpIdx, Lane).V;
+    }
+
+    /// \returns true if the data structure is empty.
+    bool empty() const { return OpsVec.empty(); }
+
+    /// \Returns a value vector with the operands across all lanes for the
+    /// operand at \p OpIdx.
+    ValueList getVL(unsigned OpIdx) {
+      ValueList OpVL;
+      for (auto &OpData : OpsVec[OpIdx])
+        OpVL.push_back(OpData.V);
+      return OpVL;
+    }
+
+    /// \Returns the maximum number of operands that are allowed to be reordered
+    /// for \p Lane. This is used as a heuristic for selecting the first lane to
+    /// start operand reordering.
+    unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
+      unsigned CntTrue = 0;
+      unsigned NumOperands = getNumOperands();
+      // Operands with the same APO can be reordered. We therefore need to count
+      // how many of them we have for each APO, like this: Cnt[APO] = x.
+      // Since we only have two APOs, namely true and false, we can avoid using
+      // a map. Instead we can simply count the number of operands that
+      // correspond to one of them (in this case the 'true' APO), and calculate
+      // the other by subtracting it from the total number of operands.
+      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
+        if (getData(OpIdx, Lane).APO)
+          CntTrue++;
+      unsigned CntFalse = NumOperands - CntTrue;
+      return std::max(CntTrue, CntFalse);
+    }
+
+    /// Clears the data.
+    void clear() { OpsVec.clear(); }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) {
+      const unsigned Indent = 2;
+      unsigned Cnt = 0;
+      for (const auto &OpDataVec : OpsVec) {
+        OS << Cnt++ << "\n";
+        for (const auto &OpData : OpDataVec) {
+          OS.indent(Indent) << "{";
+          if (Value *V = OpData.V)
+            OS << *V;
+          else
+            OS << "null";
+          OS << ", APO:" << OpData.APO << "}\n";
+        }
+        OS << "\n";
+      }
+      return OS;
+    }
+
+    /// Debug print.
+    LLVM_DUMP_METHOD void dump() {
+      print(dbgs());
+    }
+#endif
+  };
+
 private:
   struct TreeEntry;
 
@@ -681,10 +910,18 @@
   /// be beneficial even the tree height is tiny.
   bool isFullyVectorizableTinyTree();
 
-  /// \reorder commutative operands to get better probability of
+  /// Search all operands in Ops[*][Lane] for the operand that matches
+  /// best with the one in Ops[OpIdx][LastLane] and return its index.
+  Optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+                                    VLOperands &Ops,
+                                    ArrayRef<ReorderingMode> ReorderingModes);
+
+  /// Reorder the operands in \p Ops to improve vectorization.
+  void reorderOperandVecs(VLOperands &Ops);
+
+  /// Reorder commutative or alt operands to get better probability of
   /// generating vectorized code.
-  void reorderInputsAccordingToOpcode(const InstructionsState &S,
-                                      ArrayRef<Value *> VL,
+  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                       SmallVectorImpl<Value *> &Left,
                                       SmallVectorImpl<Value *> &Right);
   struct TreeEntry {
@@ -1866,7 +2103,7 @@
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.
         assert(P0 == SwapP0 && "Commutative Predicate mismatch");
-        reorderInputsAccordingToOpcode(S, VL, Left, Right);
+        reorderInputsAccordingToOpcode(VL, Left, Right);
       } else {
         // Collect operands - commute if it uses the swapped predicate.
         for (Value *V : VL) {
@@ -1912,7 +2149,7 @@
       // have the same opcode.
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
-        reorderInputsAccordingToOpcode(S, VL, Left, Right);
+        reorderInputsAccordingToOpcode(VL, Left, Right);
         UserTreeIdx.EdgeIdx = 0;
         buildTree_rec(Left, Depth + 1, UserTreeIdx);
         UserTreeIdx.EdgeIdx = 1;
@@ -2087,7 +2324,7 @@
       // Reorder operands if reordering would enable vectorization.
       if (isa<BinaryOperator>(VL0)) {
         ValueList Left, Right;
-        reorderInputsAccordingToOpcode(S, VL, Left, Right);
+        reorderInputsAccordingToOpcode(VL, Left, Right);
         UserTreeIdx.EdgeIdx = 0;
         buildTree_rec(Left, Depth + 1, UserTreeIdx);
         UserTreeIdx.EdgeIdx = 1;
@@ -2802,174 +3039,189 @@
   return getGatherCost(VecTy, ShuffledElements);
 }
 
-// Return true if the i'th left and right operands can be commuted.
-//
-// The vectorizer is trying to either have all elements one side being
-// instruction with the same opcode to enable further vectorization, or having
-// a splat to lower the vectorizing cost.
-static bool shouldReorderOperands(int i, ArrayRef<Value *> Left,
-                                  ArrayRef<Value *> Right,
-                                  bool AllSameOpcodeLeft,
-                                  bool AllSameOpcodeRight, bool SplatLeft,
-                                  bool SplatRight) {
-  Value *PrevLeft = Left[i - 1];
-  Value *PrevRight = Right[i - 1];
-  Value *CurrLeft = Left[i];
-  Value *CurrRight = Right[i];
-
-  // If we have "SplatRight", try to see if commuting is needed to preserve it.
-  if (SplatRight) {
-    if (CurrRight == PrevRight)
-      // Preserve SplatRight
-      return false;
-    if (CurrLeft == PrevRight) {
-      // Commuting would preserve SplatRight, but we don't want to break
-      // SplatLeft either, i.e. preserve the original order if possible.
-      // (FIXME: why do we care?)
-      if (SplatLeft && CurrLeft == PrevLeft)
-        return false;
-      return true;
-    }
-  }
-  // Symmetrically handle Right side.
-  if (SplatLeft) {
-    if (CurrLeft == PrevLeft)
-      // Preserve SplatLeft
-      return false;
-    if (CurrRight == PrevLeft)
-      return true;
-  }
+// Search all operands in Ops[*][Lane] for the one that best matches
+// Ops[OpIdx][LastLane] and return its operand index.
+// If no good match can be found, return None.
+Optional<unsigned>
+BoUpSLP::getBestOperand(unsigned OpIdx, int Lane, int LastLane, VLOperands &Ops,
+                        ArrayRef<ReorderingMode> ReorderingModes) {
+  unsigned NumOperands = Ops.getNumOperands();
 
-  Instruction *ILeft = dyn_cast<Instruction>(CurrLeft);
-  Instruction *IRight = dyn_cast<Instruction>(CurrRight);
+  // The operand of the previous lane at OpIdx.
+  Value *OpLastLane = Ops.getData(OpIdx, LastLane).V;
 
-  // If we have "AllSameOpcodeRight", try to see if the left operands preserves
-  // it and not the right, in this case we want to commute.
-  if (AllSameOpcodeRight) {
-    unsigned RightPrevOpcode = cast<Instruction>(PrevRight)->getOpcode();
-    if (IRight && RightPrevOpcode == IRight->getOpcode())
-      // Do not commute, a match on the right preserves AllSameOpcodeRight
-      return false;
-    if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
-      // We have a match and may want to commute, but first check if there is
-      // not also a match on the existing operands on the Left to preserve
-      // AllSameOpcodeLeft, i.e. preserve the original order if possible.
-      // (FIXME: why do we care?)
-      if (AllSameOpcodeLeft && ILeft &&
-          cast<Instruction>(PrevLeft)->getOpcode() == ILeft->getOpcode())
-        return false;
-      return true;
-    }
-  }
-  // Symmetrically handle Left side.
-  if (AllSameOpcodeLeft) {
-    unsigned LeftPrevOpcode = cast<Instruction>(PrevLeft)->getOpcode();
-    if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
-      return false;
-    if (IRight && LeftPrevOpcode == IRight->getOpcode())
-      return true;
-  }
-  return false;
-}
+  // Our strategy mode for OpIdx.
+  ReorderingMode RMode = ReorderingModes[OpIdx];
 
-void BoUpSLP::reorderInputsAccordingToOpcode(const InstructionsState &S,
-                                             ArrayRef<Value *> VL,
-                                             SmallVectorImpl<Value *> &Left,
-                                             SmallVectorImpl<Value *> &Right) {
-  assert(!VL.empty() && Left.empty() && Right.empty() &&
-         "Unexpected instruction/operand lists");
+  // The linearized opcode of the operand at OpIdx, Lane.
+  bool OpIdxAPO = Ops.getData(OpIdx, Lane).APO;
 
-  // Push left and right operands of binary operation into Left and Right
-  for (Value *V : VL) {
-    auto *I = cast<Instruction>(V);
-    assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");
-    Left.push_back(I->getOperand(0));
-    Right.push_back(I->getOperand(1));
-  }
-
-  // Keep track if we have instructions with all the same opcode on one side.
-  bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
-  bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
-  // Keep track if we have one side with all the same value (broadcast).
-  bool SplatLeft = true;
-  bool SplatRight = true;
-
-  for (unsigned i = 1, e = VL.size(); i != e; ++i) {
-    Instruction *I = cast<Instruction>(VL[i]);
-    // Commute to favor either a splat or maximizing having the same opcodes on
-    // one side.
-    if (isCommutative(I) &&
-        shouldReorderOperands(i, Left, Right, AllSameOpcodeLeft,
-                              AllSameOpcodeRight, SplatLeft, SplatRight))
-      std::swap(Left[i], Right[i]);
-
-    // Update Splat* and AllSameOpcode* after the insertion.
-    SplatRight = SplatRight && (Right[i - 1] == Right[i]);
-    SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
-    AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
-                        (cast<Instruction>(Left[i - 1])->getOpcode() ==
-                         cast<Instruction>(Left[i])->getOpcode());
-    AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
-                         (cast<Instruction>(Right[i - 1])->getOpcode() ==
-                          cast<Instruction>(Right[i])->getOpcode());
-  }
-
-  // If one operand end up being broadcast, return this operand order.
-  if (SplatRight || SplatLeft)
-    return;
+  // Iterate through all unused operands.
+  for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
+    // Get the operand at Idx and Lane.
+    auto &OpData = Ops.getData(Idx, Lane);
+    Value *Op = OpData.V;
+    bool OpAPO = OpData.APO;
 
-  // Finally check if we can get longer vectorizable chain by reordering
-  // without breaking the good operand order detected above.
-  // E.g. If we have something like-
-  // load a[0] - load b[0]
-  // load b[1] + load a[1]
-  // load a[2] - load b[2]
-  // load a[3] + load b[3]
-  // Reordering the second load b[1] + load a[1] would allow us to vectorize
-  // this code and we still retain AllSameOpcode property.
-  // FIXME: This load reordering might break AllSameOpcode in some rare cases
-  // such as-
-  // add a[0],c[0]  load b[0]
-  // add a[1],c[2]  load b[1]
-  // b[2]           load b[2]
-  // add a[3],c[3]  load b[3]
-  for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) {
-    if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
-      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
-        if (isConsecutiveAccess(L, L1, *DL, *SE)) {
-          auto *VL1 = cast<Instruction>(VL[j]);
-          auto *VL2 = cast<Instruction>(VL[j + 1]);
-          if (isCommutative(VL2)) {
-            std::swap(Left[j + 1], Right[j + 1]);
-            continue;
-          }
-          if (isCommutative(VL1)) {
-            std::swap(Left[j], Right[j]);
-            continue;
-          }
+    // Skip already selected operands.
+    if (OpData.IsUsed)
+      continue;
+
+    // Skip if we are trying to move the operand to a position with a different
+    // opcode in the linearized tree form. This would break the semantics.
+    if (OpAPO != OpIdxAPO)
+      continue;
+
+    // Look for an operand that matches the current mode.
+    switch (RMode) {
+    case ReorderingMode::Load:
+      if (isa<LoadInst>(Op)) {
+        // Figure out which is left and right, so that we can check for
+        // consecutive loads
+        bool LeftToRight = Lane > LastLane;
+        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
+        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
+        if (isConsecutiveAccess(cast<LoadInst>(OpLeft), cast<LoadInst>(OpRight),
+                                *DL, *SE)) {
+          OpData.IsUsed = true;
+          return Idx;
         }
       }
-    }
-    if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
-      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
-        if (isConsecutiveAccess(L, L1, *DL, *SE)) {
-          auto *VL1 = cast<Instruction>(VL[j]);
-          auto *VL2 = cast<Instruction>(VL[j + 1]);
-          if (isCommutative(VL2)) {
-            std::swap(Left[j + 1], Right[j + 1]);
-            continue;
-          }
-          if (isCommutative(VL1)) {
-            std::swap(Left[j], Right[j]);
-            continue;
-          }
+      break;
+    case ReorderingMode::Opcode:
+      if (isa<Instruction>(Op))
+        if (cast<Instruction>(Op)->getOpcode() ==
+            cast<Instruction>(OpLastLane)->getOpcode()) {
+          OpData.IsUsed = true;
+          return Idx;
         }
+      break;
+    case ReorderingMode::Constant:
+      if (isa<Constant>(Op)) {
+        OpData.IsUsed = true;
+        return Idx;
+      }
+      break;
+    case ReorderingMode::Splat: {
+      if (Op == OpLastLane) {
+        OpData.IsUsed = true;
+        return Idx;
+      }
+      break;
+    }
+    case ReorderingMode::Failed:
+      return None;
+    }
+   }
+  // If we could not find a good match return None.
+  return None;
+}
+
+/// Helper for reorderOperandVecs. \Returns the lane that we should start
+/// reordering from. This is the one which has the least number of operands that
+/// can freely move about.
+static unsigned getBestLaneToStartReordering(const BoUpSLP::VLOperands &Ops) {
+  unsigned BestLane = 0;
+  unsigned Min = UINT_MAX;
+  for (unsigned Lane = 0, NumLanes = Ops.getNumLanes(); Lane != NumLanes;
+       ++Lane) {
+    unsigned NumFreeOps = Ops.getMaxNumOperandsThatCanBeReordered(Lane);
+    if (NumFreeOps < Min) {
+      Min = NumFreeOps;
+      BestLane = Lane;
+    }
+  }
+  return BestLane;
+}
+
+// Performs operand reordering for 2 or more operands.
+// The operands are read from 'Ops[OpIdx][Lane]' and are reordered in place,
+// i.e., the reordered operands are returned in 'Ops[OpIdx][Lane]'.
+void BoUpSLP::reorderOperandVecs(VLOperands &Ops) {
+  unsigned NumOperands = Ops.getNumOperands();
+  unsigned NumLanes = Ops.getNumLanes();
+  // Each operand has its own mode. We are using this mode to help us select the
+  // instructions for each lane, so that they match best with the ones we have
+  // selected so far.
+  SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
+
+  // This is a greedy single-pass algorithm. We are going over each lane once
+  // and deciding on the best order right away with no back-tracking. However,
+  // in order to increase its effectiveness, we start with the lane that has
+  // operands that can move the least. For example, given the following lanes:
+  //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
+  //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
+  //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
+  //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
+  // we will start at Lane 1, since the operands of the subtraction cannot be
+  // reordered. Then we will visit the rest of the lanes in a circular fashion.
+  // That is, Lane 2, then Lane 0, and finally Lane 3.
+
+  // Find the first lane that we will start our search from.
+  unsigned FirstLane = getBestLaneToStartReordering(Ops);
+
+  // Initialize the modes.
+  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+    Value *OpLane0 = Ops.getValue(OpIdx, FirstLane);
+    // Select the mode from the kind of value found at FirstLane.
+    if (isa<LoadInst>(OpLane0))
+      ReorderingModes[OpIdx] = ReorderingMode::Load;
+    else if (isa<Instruction>(OpLane0))
+      ReorderingModes[OpIdx] = ReorderingMode::Opcode;
+    else if (isa<Constant>(OpLane0))
+      ReorderingModes[OpIdx] = ReorderingMode::Constant;
+    else if (isa<Argument>(OpLane0))
+      // Our best hope is a Splat. It may save some cost in some cases.
+      ReorderingModes[OpIdx] = ReorderingMode::Splat;
+    else
+      llvm_unreachable("Bad Operand");
+  }
+
+  // We keep the original operand order for the FirstLane, so reorder the rest
+  // of the lanes.
+  // We are visiting the lanes in a circular fashion, using FirstLane as the
+  // center point and increasing the radius distance.
+  for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
+    // Visit the lane on the right and then the lane on the left.
+    for (int Direction : {+1, -1}) {
+      int Lane = FirstLane + Direction * Distance;
+      if (Lane < 0 || Lane >= (int)NumLanes)
+        continue;
+      int LastLane = Lane - Direction;
+      assert(LastLane >= 0 && LastLane < (int)NumLanes && "Out of bounds");
+      // Look for a good match for each operand.
+      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+        // Search for the operand that matches Ops[OpIdx][LastLane].
+        Optional<unsigned> BestIdx =
+            getBestOperand(OpIdx, Lane, LastLane, Ops, ReorderingModes);
+        // By not selecting a value, we allow the operands that follow to select
+        // a better matching value. We will get a non-null value in the next run
+        // of getBestOperand().
+        if (BestIdx)
+          // Swap the current operand with the one returned by getBestOperand().
+          Ops.swap(OpIdx, BestIdx.getValue(), Lane);
+        else
+          // If we failed to find a best operand, then set the mode to
+          // 'Failed'.
+          ReorderingModes[OpIdx] = ReorderingMode::Failed;
       }
     }
-    // else unchanged
   }
 }
 
+// Perform operand reordering on the instructions in VL.
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+                                             SmallVectorImpl<Value *> &Left,
+                                             SmallVectorImpl<Value *> &Right) {
+  if (VL.empty())
+    return;
+  VLOperands Ops(VL);
+  // Reorder the operands in place.
+  reorderOperandVecs(Ops);
+  Left = Ops.getVL(0);
+  Right = Ops.getVL(1);
+}
+
 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL,
                                         const InstructionsState &S) {
   // Get the basic block this bundle is in. All instructions in the bundle
Index: test/Transforms/SLPVectorizer/X86/alternate-int.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -536,13 +536,11 @@
 
 define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; CHECK-LABEL: @add_sub_v8i32_splat(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <8 x i32> [[R7]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
Index: test/Transforms/SLPVectorizer/X86/crash_lencod.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/crash_lencod.ll
+++ test/Transforms/SLPVectorizer/X86/crash_lencod.ll
@@ -126,14 +126,15 @@
 define fastcc void @dct36(double* %inbuf) {
 ; CHECK-LABEL: @dct36(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX44]], align 8
-; CHECK-NEXT:    [[ADD46:%.*]] = fadd double [[TMP0]], undef
-; CHECK-NEXT:    store double [[ADD46]], double* [[ARRAYIDX41]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[INBUF]], align 8
-; CHECK-NEXT:    [[ADD49:%.*]] = fadd double [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    store double [[ADD49]], double* [[ARRAYIDX44]], align 8
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[INBUF]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double undef, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
Index: test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -31,8 +31,8 @@
 ; CHECK:       cond.false66.us:
 ; CHECK-NEXT:    [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double undef, i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 0.000000e+00, double 0xBFA5CC2D1960285F>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double 0xBFA5CC2D1960285F, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 0.000000e+00, double undef>
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], <double 1.400000e+02, double 1.400000e+02>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], <double 5.000000e+01, double 5.200000e+01>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]]
Index: test/Transforms/SLPVectorizer/X86/operandorder.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -37,13 +37,10 @@
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
-; CHECK-NEXT:    [[V0_1:%.*]] = load double, double* [[FROM]], align 4
-; CHECK-NEXT:    [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V0_2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
@@ -78,14 +75,11 @@
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
-; CHECK-NEXT:    [[V0_1:%.*]] = load double, double* [[FROM]], align 4
-; CHECK-NEXT:    [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V0_2]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
@@ -119,14 +113,11 @@
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
-; CHECK-NEXT:    [[V0_1:%.*]] = load double, double* [[FROM]], align 4
-; CHECK-NEXT:    [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V0_2]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
@@ -202,16 +193,15 @@
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
-; CHECK-NEXT:    [[V0_1:%.*]] = load double, double* [[FROM]], align 4
-; CHECK-NEXT:    [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[P]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; CHECK:       ext:
 ; CHECK-NEXT:    ret void