diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -781,6 +781,15 @@
       Scalars[Mask[I]] = Prev[I];
 }
 
+/// Apply \p ReorderMask on \p Order.
+static void applyReorder(SmallVectorImpl<int> &Order,
+                         ArrayRef<unsigned> ReorderMask) {
+  SmallVector<int> OrigOrder(Order.begin(), Order.end());
+  assert(Order.size() == ReorderMask.size() && "Expected same size");
+  for (unsigned Idx : seq<unsigned>(0, Order.size()))
+    Order[Idx] = OrigOrder[ReorderMask[Idx]];
+}
+
 /// Checks if the provided value does not require scheduling. It does not
 /// require scheduling if this is not an instruction or it is an instruction
 /// that does not read/write memory and all operands are either not instructions
@@ -4011,6 +4020,25 @@
       transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
         return I < E ? static_cast<int>(I) : UndefMaskElem;
       });
+      // If the UserTE already has a ReorderIndices mask, we need to combine
+      // the user's mask with the current masks.
+      if (!Data.first->UserTreeIndices.empty()) {
+        // Returns the single TreeEntry of `TE` if found in `Users`.
+        auto GetUserTEInUsers = [&Users](TreeEntry *TE) {
+          TreeEntry *UserTE = nullptr;
+          for (const EdgeInfo &EI : TE->UserTreeIndices)
+            if (Users.count(EI.UserTE)) {
+              assert(UserTE == nullptr && "Expected only one user");
+              UserTE = EI.UserTE;
+            }
+          return UserTE;
+        };
+        if (TreeEntry *UserTE = GetUserTEInUsers(Data.first))
+          if (!UserTE->ReorderIndices.empty()) {
+            applyReorder(MaskOrder, UserTE->ReorderIndices);
+            applyReorder(Mask, UserTE->ReorderIndices);
+          }
+      }
       for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
         TreeEntry *TE = Op.second;
         OrderedEntries.remove(TE);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -728,31 +728,39 @@
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[TMP10]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX41]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i32>, <2 x i32>* [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = mul nsw <2 x i32> [[TMP11]], [[TMP7]]
-; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP13]], [[TMP9]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <4 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[ARRAYIDX72]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX72]] to <4 x i32>*
 ; CHECK-NEXT:    [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 7
 ; CHECK-NEXT:    [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
 ; CHECK-NEXT:    [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 11
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX12]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[ARRAYIDX28]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP20:%.*]] = load <2 x i32>, <2 x i32>* [[TMP19]], align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX48]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP22:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX64]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP24:%.*]] = load <2 x i32>, <2 x i32>* [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX12]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP23:%.*]] = load <2 x i32>, <2 x i32>* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX28]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP25:%.*]] = load <2 x i32>, <2 x i32>* [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast i32* [[ARRAYIDX48]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i32>, <2 x i32>* [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i32* [[ARRAYIDX64]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP29:%.*]] = load <2 x i32>, <2 x i32>* [[TMP28]], align 4
 ; CHECK-NEXT:    store i32 [[MUL73]], i32* [[Z]], align 4
-; CHECK-NEXT:    store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP16]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP21]], align 4
 ; CHECK-NEXT:    store i32 [[MUL85]], i32* [[ARRAYIDX76]], align 4
 ; CHECK-NEXT:    store i32 [[MUL87]], i32* [[ARRAYIDX88]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <2 x i32> [[TMP22]], [[TMP18]]
-; CHECK-NEXT:    [[TMP26:%.*]] = mul nsw <2 x i32> [[TMP24]], [[TMP20]]
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> [[TMP26]], <4 x i32>
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i32* [[ARRAYIDX84]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP27]], align 4
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i32> [[TMP29]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i32> [[TMP30]], <4 x i32> [[TMP31]], <4 x i32>
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <2 x i32> [[TMP23]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> [[TMP34]], <4 x i32>
+; CHECK-NEXT:    [[TMP36:%.*]] = mul nsw <4 x i32> [[TMP32]], [[TMP35]]
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast i32* [[ARRAYIDX84]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP36]], <4 x i32>* [[TMP37]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -854,20 +862,44 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1
+; CHECK-NEXT:    [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]]
+; CHECK-NEXT:    [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2
+; CHECK-NEXT:    [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]]
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3
+; CHECK-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]]
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]]
+; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]]
+; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[X]] to <4 x i16>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX4]] to <4 x i16>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[Y]] to <4 x i16>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX20]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[Y]] to <4 x i16>*
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>*
-; CHECK-NEXT:    store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP10]], align 2
+; CHECK-NEXT:    [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP9]], i64 4
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP8]], i64 5
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP11]], i64 6
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP10]], i64 7
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP3]], i64 4
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP2]], i64 5
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP5]], i64 6
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP4]], i64 7
+; CHECK-NEXT:    [[TMP22:%.*]] = mul <8 x i16> [[TMP16]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>*
+; CHECK-NEXT:    store <8 x i16> [[TMP22]], <8 x i16>* [[TMP23]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-grtev4-linux-gnu -S | FileCheck %s
+
+; This checks that reorderBottomToTop() can handle reordering of a TreeEntry
+; which has a user TreeEntry that has already been reordered.
+; Here is how the crash occurs:
+;
+;                       (N4)OrderB
+;                        |
+; (N1)OrderA (N2)OrderA (N3)NoOrder
+;       \        |        /
+;            (Phi)NoOrder
+;
+; 1. Phi is visited along with its operands (N1,N2,N3). BestOrder is "OrderA".
+; 2. Phi and all its operands (N1,N2,N3) are reordered. The result is:
+;
+;                       (N4)OrderB
+;                        |
+; (N1)NoOrder (N2)NoOrder (N3)OrderA
+;       \         |         /
+;             (Phi)OrderA
+;
+; 3. N3 is now visited along with its operand N4. BestOrder is "OrderB".
+; 4. N3 and N4 are reordered. The result is:
+;
+;                       (N4)NoOrder
+;                        |
+; (N1)NoOrder (N2)NoOrder (N3)OrderB
+;       \         |         /
+;             (Phi)OrderA
+;
+; At this point there is a discrepancy between Phi's operand 2, which was
+; reordered based on OrderA, and N3, which now follows OrderB. This results
+; in a crash in vectorizeTree() on its way from N3 back to the Phi: since
+; N3->isSame(Phi's operand 2) returns false, vectorizeTree() skips N3.
+;
+; This patch fixes N3's order by setting it to the order that results from
+; combining both OrderB and OrderA.
+;
+; NOTE: The crash shows up when reorderTopToBottom() does not reorder the
+; tree, so to simulate this we add external store users. Alternatively, one
+; can comment out reorderTopToBottom() and remove the stores.
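+;
+; As a rough illustration of that combination (glossing over the distinction
+; between an order and its inverse mask): applyReorder() computes
+; Order[Idx] = OrigOrder[ReorderMask[Idx]], so with this test's
+; OrderB = {2, 3, 0, 1} and OrderA = {1, 0, 2, 3} the combined order for N3
+; would be {OrderB[1], OrderB[0], OrderB[2], OrderB[3]} = {3, 2, 0, 1}.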
+
+
+define void @reorder_crash(float* %ptr) {
+; CHECK-LABEL: @reorder_crash(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0
+; CHECK-NEXT:    br i1 undef, label [[BB0:%.*]], label [[BB12:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb12:
+; CHECK-NEXT:    br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[SHUFFLE]], zeroinitializer
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x float> [ [[TMP1]], [[BB0]] ], [ [[TMP4]], [[BB1]] ], [ [[SHUFFLE1]], [[BB2]] ]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %gep0 = getelementptr inbounds float, float* %ptr, i64 0
+  %gep1 = getelementptr inbounds float, float* %ptr, i64 1
+  %gep2 = getelementptr inbounds float, float* %ptr, i64 2
+  %gep3 = getelementptr inbounds float, float* %ptr, i64 3
+  br i1 undef, label %bb0, label %bb12
+
+bb0:
+  ; Used by phi in this order: 1, 0, 2, 3
+  %ld00 = load float, float* %gep0
+  %ld01 = load float, float* %gep1
+  %ld02 = load float, float* %gep2
+  %ld03 = load float, float* %gep3
+
+  ; External store users in natural order 0, 1, 2, 3
+  store float %ld00, float* %gep0
+  store float %ld01, float* %gep1
+  store float %ld02, float* %gep2
+  store float %ld03, float* %gep3
+  br label %bb3
+
+bb12:
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  ; Used by phi in this order: 1, 0, 2, 3
+  %ld10 = load float, float* %gep0
+  %ld11 = load float, float* %gep1
+  %ld12 = load float, float* %gep2
+  %ld13 = load float, float* %gep3
+
+  ; External store users in natural order 0, 1, 2, 3
+  store float %ld10, float* %gep0
+  store float %ld11, float* %gep1
+  store float %ld12, float* %gep2
+  store float %ld13, float* %gep3
+
+  br label %bb3
+
+bb2:
+  ; Used by fadd in this order: 2, 3, 0, 1
+  %ld20 = load float, float* %gep0
+  %ld21 = load float, float* %gep1
+  %ld22 = load float, float* %gep2
+  %ld23 = load float, float* %gep3
+
+  ; Used by phi in this order: 0, 1, 2, 3
+  %add20 = fadd float %ld22, 0.0
+  %add21 = fadd float %ld23, 0.0
+  %add22 = fadd float %ld20, 0.0
+  %add23 = fadd float %ld21, 0.0
+  br label %bb3
+
+bb3:
+  %phi0 = phi float [ %ld01, %bb0 ], [ %ld11, %bb1 ], [ %add20, %bb2 ]
+  %phi1 = phi float [ %ld00, %bb0 ], [ %ld10, %bb1 ], [ %add21, %bb2 ]
+  %phi2 = phi float [ %ld02, %bb0 ], [ %ld12, %bb1 ], [ %add22, %bb2 ]
+  %phi3 = phi float [ %ld03, %bb0 ], [ %ld13, %bb1 ], [ %add23, %bb2 ]
+  ret void
+}
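
To make the combination step concrete, here is a standalone sketch of the
applyReorder() semantics (an illustration only: it assumes plain std::vector
in place of llvm::SmallVector/ArrayRef and reuses the orders from the test
above; it is not the in-tree implementation):

#include <cassert>
#include <cstdio>
#include <vector>

// Same rule as the patch's applyReorder(): permute Order in place so that
// Order[Idx] becomes OrigOrder[ReorderMask[Idx]].
static void applyReorder(std::vector<int> &Order,
                         const std::vector<unsigned> &ReorderMask) {
  assert(Order.size() == ReorderMask.size() && "Expected same size");
  std::vector<int> OrigOrder(Order);
  for (unsigned Idx = 0; Idx < Order.size(); ++Idx)
    Order[Idx] = OrigOrder[ReorderMask[Idx]];
}

int main() {
  std::vector<int> Mask = {2, 3, 0, 1};             // OrderB (from N3/N4)
  std::vector<unsigned> UserReorder = {1, 0, 2, 3}; // OrderA (user's ReorderIndices)
  // Fold the user's order into the mask, as the patch does for both
  // MaskOrder and Mask.
  applyReorder(Mask, UserReorder);
  for (int I : Mask)
    printf("%d ", I); // prints: 3 2 0 1
  printf("\n");
  return 0;
}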