Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -596,12 +596,12 @@
 
   /// \reorder commutative operands in alt shuffle if they result in
   /// vectorized code.
-  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+  void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
                                  SmallVectorImpl<Value *> &Left,
                                  SmallVectorImpl<Value *> &Right);
   /// \reorder commutative operands to get better probability of
   /// generating vectorized code.
-  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+  void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
                                       SmallVectorImpl<Value *> &Left,
                                       SmallVectorImpl<Value *> &Right);
 
   struct TreeEntry {
@@ -1302,6 +1302,38 @@
   }
 }
 
+static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) {
+  switch(Opcode) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return ConstantInt::getNullValue(Ty);
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+    return ConstantInt::get(Ty, /*V=*/1);
+  case Instruction::FAdd:
+  case Instruction::FSub:
+    return ConstantFP::get(Ty, /*V=*/0.0);
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+    return ConstantFP::get(Ty, /*V=*/1.0);
+  case Instruction::And:
+    return ConstantInt::getAllOnesValue(Ty);
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return ConstantInt::getNullValue(Type::getInt32Ty(Ty->getContext()));
+  default:
+    break;
+  }
+  llvm_unreachable("unknown binop for default constant value");
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             int UserTreeIdx) {
   bool isAltShuffle = false;
@@ -1635,7 +1667,7 @@
       // have the same opcode.
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right);
+        reorderInputsAccordingToOpcode(VL0->getOpcode(), VL, Left, Right);
        buildTree_rec(Left, Depth + 1, UserTreeIdx);
        buildTree_rec(Right, Depth + 1, UserTreeIdx);
        return;
@@ -1799,7 +1831,7 @@
      // Reorder operands if reordering would enable vectorization.
      if (isa<BinaryOperator>(VL0)) {
        ValueList Left, Right;
-        reorderAltShuffleOperands(VL, Left, Right);
+        reorderAltShuffleOperands(VL0->getOpcode(), VL, Left, Right);
        buildTree_rec(Left, Depth + 1, UserTreeIdx);
        buildTree_rec(Right, Depth + 1, UserTreeIdx);
        return;
@@ -2344,13 +2376,20 @@
 // load a[3] + load b[3]
 // Reordering the second load b[1] load a[1] would allow us to vectorize this
 // code.
-void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
                                         SmallVectorImpl<Value *> &Left,
                                         SmallVectorImpl<Value *> &Right) {
   // Push left and right operands of binary operation into Left and Right
-  for (Value *i : VL) {
-    Left.push_back(cast<Instruction>(i)->getOperand(0));
-    Right.push_back(cast<Instruction>(i)->getOperand(1));
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  for (Value *V : VL) {
+    auto *I = cast<Instruction>(V);
+    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
+      Left.push_back(I->getOperand(0));
+      Right.push_back(I->getOperand(1));
+    } else {
+      Left.push_back(I);
+      Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType()));
+    }
   }
 
   // Reorder if we have a commutative operation and consecutive access
@@ -2395,14 +2434,17 @@
 // The vectorizer is trying to either have all elements one side being
 // instruction with the same opcode to enable further vectorization, or having
 // a splat to lower the vectorizing cost.
-static bool shouldReorderOperands(int i, Instruction &I,
-                                  SmallVectorImpl<Value *> &Left,
-                                  SmallVectorImpl<Value *> &Right,
-                                  bool AllSameOpcodeLeft,
-                                  bool AllSameOpcodeRight, bool SplatLeft,
-                                  bool SplatRight) {
-  Value *VLeft = I.getOperand(0);
-  Value *VRight = I.getOperand(1);
+static bool shouldReorderOperands(
+    int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
+    ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
+    bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
+  if (I.getOpcode() == Opcode) {
+    VLeft = I.getOperand(0);
+    VRight = I.getOperand(1);
+  } else {
+    VLeft = &I;
+    VRight = getDefaultConstantForOpcode(Opcode, I.getType());
+  }
   // If we have "SplatRight", try to see if commuting is needed to preserve it.
   if (SplatRight) {
     if (VRight == Right[i - 1])
@@ -2458,15 +2500,24 @@
   return false;
 }
 
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
+                                             ArrayRef<Value *> VL,
                                              SmallVectorImpl<Value *> &Left,
                                              SmallVectorImpl<Value *> &Right) {
   if (VL.size()) {
     // Peel the first iteration out of the loop since there's nothing
     // interesting to do anyway and it simplifies the checks in the loop.
-    auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
-    auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+    auto *I = cast<Instruction>(VL[0]);
+    Value *VLeft;
+    Value *VRight;
+    if (I->getOpcode() == Opcode) {
+      VLeft = I->getOperand(0);
+      VRight = I->getOperand(1);
+    } else {
+      VLeft = I;
+      VRight = getDefaultConstantForOpcode(Opcode, I->getType());
+    }
     if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
      // Favor having instruction to the right. FIXME: why?
      std::swap(VLeft, VRight);
@@ -2483,16 +2534,21 @@
 
   for (unsigned i = 1, e = VL.size(); i != e; ++i) {
     Instruction *I = cast<Instruction>(VL[i]);
-    assert(I->isCommutative() && "Can only process commutative instruction");
+    assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
+            (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
+           "Can only process commutative instruction");
     // Commute to favor either a splat or maximizing having the same opcodes on
     // one side.
-    if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
-                              AllSameOpcodeRight, SplatLeft, SplatRight)) {
-      Left.push_back(I->getOperand(1));
-      Right.push_back(I->getOperand(0));
+    Value *VLeft;
+    Value *VRight;
+    if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
+                              AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
+                              VRight)) {
+      Left.push_back(VRight);
+      Right.push_back(VLeft);
     } else {
-      Left.push_back(I->getOperand(0));
-      Right.push_back(I->getOperand(1));
+      Left.push_back(VLeft);
+      Right.push_back(VRight);
     }
     // Update Splat* and AllSameOpcode* after the insertion.
     SplatRight = SplatRight && (Right[i - 1] == Right[i]);
@@ -2843,11 +2899,19 @@
     case Instruction::Xor: {
       ValueList LHSVL, RHSVL;
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
-        reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+        reorderInputsAccordingToOpcode(VL0->getOpcode(),
+                                       E->Scalars, LHSVL, RHSVL);
      else
        for (Value *V : E->Scalars) {
-          LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
-          RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
+          auto *I = cast<Instruction>(V);
+          if (I->getOpcode() == VL0->getOpcode()) {
+            LHSVL.push_back(I->getOperand(0));
+            RHSVL.push_back(I->getOperand(1));
+          } else {
+            LHSVL.push_back(V);
+            RHSVL.push_back(
+                getDefaultConstantForOpcode(VL0->getOpcode(), I->getType()));
+          }
        }
 
       setInsertPointAfterBundle(E->Scalars, VL0);
@@ -3011,7 +3075,7 @@
     case Instruction::ShuffleVector: {
       ValueList LHSVL, RHSVL;
       assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
-      reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
+      reorderAltShuffleOperands(VL0->getOpcode(), E->Scalars, LHSVL, RHSVL);
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       Value *LHS = vectorizeTree(LHSVL);
Index: test/Transforms/SLPVectorizer/X86/reorder.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/reorder.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -slp-vectorizer -mcpu=skx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global float 0.000000e+00, align 4
+@c = common global [1 x i32] zeroinitializer, align 4
+@d = common global [1 x i32] zeroinitializer, align 4
+
+define i32 @fn1() local_unnamed_addr #0 {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i32 ()* @fn1 to <2 x i8>*), align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i8> <i8 52, i8 8>, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i8> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw <2 x i32> [[TMP2]], <i32 3, i32 5>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @fn2()
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %0 = load i8, i8* bitcast (i32 ()* @fn1 to i8*), align 1
+  %1 = load i8, i8* getelementptr (i8, i8* bitcast (i32 ()* @fn1 to i8*), i64 1), align 1
+  %2 = and i8 %0, 52
+  %and = zext i8 %2 to i32
+  %shl = shl nuw nsw i32 %and, 3
+  %3 = and i8 %1, 8
+  %and2 = zext i8 %3 to i32
+  %shl3 = shl nuw nsw i32 %and2, 5
+  %or = or i32 %shl3, %shl
+  %tobool = icmp eq i32 %or, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call = tail call i32 (...) @fn2() #2
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 undef
+}
+
+declare i32 @fn2(...) #1
+
+define i32 @fn3() local_unnamed_addr #0 {
+; CHECK-LABEL: @fn3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* @b, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 0, i64 0), align 4
+; CHECK-NEXT:    [[CONV5:%.*]] = sitofp i32 [[TMP2]] to float
+; CHECK-NEXT:    [[MUL6:%.*]] = fmul float [[TMP0]], [[CONV5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 1, i64 0), align 4
+; CHECK-NEXT:    [[CONV5_1:%.*]] = sitofp i32 [[TMP3]] to float
+; CHECK-NEXT:    [[MUL6_1:%.*]] = fmul float [[TMP0]], [[CONV5_1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> <float -0.000000e+00, float undef, float undef, float undef>, float [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float -0.000000e+00, i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[MUL6]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[MUL6_1]], i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = fsub <4 x float> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x float> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = fptosi <4 x float> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([1 x i32]* @d to <4 x i32>*), align 4
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %0 = load float, float* @b, align 4
+  %1 = fmul float %0, 0.000000e+00
+  %mul = fsub float -0.000000e+00, %1
+  %conv1 = fptosi float %mul to i32
+  store i32 %conv1, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @d, i64 0, i64 0), align 4
+  %2 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 0, i64 0), align 4
+  %conv5 = sitofp i32 %2 to float
+  %mul6 = fmul float %0, %conv5
+  %add = fadd float %0, %mul6
+  %conv7 = fptosi float %add to i32
+  store i32 %conv7, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @d, i64 1, i64 0), align 4
+  %mul.1 = fsub float -0.000000e+00, %0
+  %conv1.1 = fptosi float %mul.1 to i32
+  store i32 %conv1.1, i32* getelementptr ([1 x i32], [1 x i32]* @d, i64 2, i64 0), align 4
+  %3 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 1, i64 0), align 4
+  %conv5.1 = sitofp i32 %3 to float
+  %mul6.1 = fmul float %0, %conv5.1
+  %add.1 = fadd float %0, %mul6.1
+  %conv7.1 = fptosi float %add.1 to i32
+  store i32 %conv7.1, i32* getelementptr ([1 x i32], [1 x i32]* @d, i64 3, i64 0), align 4
+  ret i32 undef
+}
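
A minimal sketch of the idea behind getDefaultConstantForOpcode, for illustration only and not part of the patch (the names %a0, %a1, %b0, %b1, %x and %p are hypothetical): when one lane of a bundle does not carry the bundle's opcode, that lane's scalar is kept as the left operand and the opcode's identity constant becomes the right operand, so Left/Right operand vectors can still be formed for every lane.

  %s0 = add i32 %a0, %b0    ; lane 0: Left = %a0, Right = %b0
  %s1 = add i32 %a1, %b1    ; lane 1: Left = %a1, Right = %b1
  %x  = load i32, i32* %p   ; lane 2 is not an Add: Left = %x, Right = 0
                            ; (the Add identity), i.e. the lane is treated as "%x + 0"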