Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4146,8 +4146,8 @@
       if (AllowReorder && R.shouldReorder()) {
         // Conceptually, there is nothing actually preventing us from trying to
         // reorder a larger list. In fact, we do exactly this when vectorizing
-        // reductions. However, at this point, we only expect to get here from
-        // tryToVectorizePair().
+        // reductions. However, at this point, we only expect to get here when
+        // there are exactly two operations.
         assert(Ops.size() == 2);
         assert(BuildVectorSlice.empty());
         Value *ReorderedOps[] = {Ops[1], Ops[0]};
@@ -4904,7 +4904,13 @@
       // Try to vectorize them.
       unsigned NumElts = (SameTypeIt - IncIt);
       DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
+      // The order in which the phi nodes appear in the program does not matter.
+      // So allow tryToVectorizeList to reorder them if it is beneficial. This
+      // is done when there are exactly two elements since tryToVectorizeList
+      // asserts that there are only two values when AllowReorder is true.
+      bool AllowReorder = NumElts == 2;
+      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+                                            None, AllowReorder)) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
Index: test/Transforms/SLPVectorizer/X86/reorder_phi.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/reorder_phi.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -basicaa -slp-vectorizer  -S -mtriple=x86_64-unknown -mcpu=corei7-avx | FileCheck %s
+
+%struct.complex = type { float, float }
+
+; CHECK-LABEL: void @foo
+define  void @foo (%struct.complex* %A, %struct.complex* %B, %struct.complex* %Result) {
+
+entry:
+  %0 = add i64 256, 0
+  br label %loop
+
+; CHECK-LABEL: loop
+; CHECK: [[REG0:%[0-9]+]] = phi <2 x float> {{.*}}[ [[REG1:%[0-9]+]], %loop ]
+; CHECK: [[REG2:%[0-9]+]] = load <2 x float>, <2 x float>*
+; CHECK: [[REG3:%[0-9]+]] = fmul <2 x float> [[REG2]]
+; CHECK: [[REG4:%[0-9]+]] = fmul <2 x float>
+; CHECK: fsub <2 x float> [[REG3]], [[REG4]]
+; CHECK: fadd <2 x float> [[REG3]], [[REG4]]
+; CHECK: shufflevector <2 x float>
+; CHECK: [[REG1]] = fadd <2 x float>{{.*}}[[REG0]]
+loop:
+
+  %1 = phi i64 [ 0, %entry ], [ %20, %loop ]
+  %2 = phi float [ 0.000000e+00, %entry ], [ %19, %loop ]
+  %3 = phi float [ 0.000000e+00, %entry ], [ %18, %loop ]
+  %4 = getelementptr inbounds %"struct.complex", %"struct.complex"* %A, i64 %1, i32 0
+  %5 = load float, float* %4, align 4
+  %6 = getelementptr inbounds %"struct.complex", %"struct.complex"* %A, i64 %1, i32 1
+  %7 = load float, float* %6, align 4
+  %8 = getelementptr inbounds %"struct.complex", %"struct.complex"* %B, i64 %1, i32 0
+  %9 = load float, float* %8, align 4
+  %10 = getelementptr inbounds %"struct.complex", %"struct.complex"* %B, i64 %1, i32 1
+  %11 = load float, float* %10, align 4
+  %12 = fmul float %5, %9
+  %13 = fmul float %7, %11
+  %14 = fsub float %12, %13
+  %15 = fmul float %7, %9
+  %16 = fmul float %5, %11
+  %17 = fadd float %15, %16
+  %18 = fadd float %3, %14
+  %19 = fadd float %2, %17
+  %20 = add nuw nsw i64 %1, 1
+  %21 = icmp eq i64 %20, %0
+  br i1 %21, label %exit, label %loop
+
+exit:
+  %22 = getelementptr inbounds %"struct.complex", %"struct.complex"* %Result,  i32 0, i32 0
+  store float %18, float* %22, align 4
+  %23 = getelementptr inbounds %"struct.complex", %"struct.complex"* %Result,  i32 0, i32 1
+  store float %19, float* %23, align 4
+
+  ret void
+
+}