Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1113,6 +1113,10 @@
     /// Sets all instruction in the scheduling region to un-scheduled.
     void resetSchedule();
 
+    /// Reorder ScheduleData after scheduling, so that the first instruction
+    /// in the list is the last one scheduled.
+    void reorderBundles();
+
     BasicBlock *BB;
 
     /// Simple memory allocation for ScheduleData.
@@ -2869,17 +2873,15 @@
   // The last instruction in the bundle in program order.
   Instruction *LastInst = nullptr;
 
-  // Find the last instruction. The common case should be that BB has been
-  // scheduled, and the last instruction is VL.back(). So we start with
-  // VL.back() and iterate over schedule data until we reach the end of the
-  // bundle. The end of the bundle is marked by null ScheduleData.
+  // Find the last instruction. If the bundle is not scheduled, then the
+  // first instruction in the bundle is the last one in BB, because we
+  // discover bundles in a backward walk. If the bundle is already scheduled,
+  // it has been reordered so that its first element is the last one scheduled.
   if (BlocksSchedules.count(BB)) {
     auto *Bundle =
         BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));
     if (Bundle && Bundle->isPartOfBundle())
-      for (; Bundle; Bundle = Bundle->NextInBundle)
-        if (Bundle->OpValue == Bundle->Inst)
-          LastInst = Bundle->Inst;
+      LastInst = Bundle->FirstInBundle->Inst;
   }
 
   // LastInst can still be null at this point if there's either not an entry
@@ -4190,6 +4192,47 @@
   ReadyInsts.clear();
 }
 
+void BoUpSLP::BlockScheduling::reorderBundles() {
+  DenseMap<ScheduleData *, ScheduleData *> ReorderMap;
+  // Walk backward in the scheduling region to discover the last instruction
+  // of each bundle.
+  for (auto *I = ScheduleEnd; I != ScheduleStart; I = I->getPrevNode()) {
+    doForAllOpcodes(I, [&ReorderMap, &I](ScheduleData *SD) {
+      if (SD->isPartOfBundle() &&
+          ReorderMap.find(SD->FirstInBundle) == ReorderMap.end()) {
+        ReorderMap[SD->FirstInBundle] = SD;
+      }
+    });
+  }
+  DenseMap<ScheduleData *, ScheduleData *>::iterator it;
+  // Swap the last scheduled instruction with the first one in the bundle.
+  for (it = ReorderMap.begin(); it != ReorderMap.end(); it++) {
+    ScheduleData *FirstSD = it->first;
+    ScheduleData *LastSD = it->second;
+    SmallVector<ScheduleData *, 4> Bundle;
+    unsigned LastPos = 0;
+    // The first instruction in the bundle is already the last one scheduled.
+    if (FirstSD == LastSD)
+      continue;
+    ScheduleData *SD = FirstSD;
+    while (SD) {
+      if (SD == LastSD)
+        LastPos = Bundle.size();
+      Bundle.push_back(SD);
+      SD = SD->NextInBundle;
+    }
+    std::swap(Bundle[0], Bundle[LastPos]);
+    for (ScheduleData *SD : Bundle)
+      SD->FirstInBundle = Bundle[0];
+    Bundle[0]->NextInBundle = Bundle[1];
+    Bundle[LastPos - 1]->NextInBundle = Bundle[LastPos];
+    if (LastPos == Bundle.size() - 1)
+      Bundle[LastPos]->NextInBundle = nullptr;
+    else
+      Bundle[LastPos]->NextInBundle = Bundle[LastPos + 1];
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -4252,6 +4295,7 @@
     NumToSchedule--;
   }
   assert(NumToSchedule == 0 && "could not schedule all instructions");
+  BS->reorderBundles();
 
   // Avoid duplicate scheduling of the block.
   BS->ScheduleStart = nullptr;
Index: test/Transforms/SLPVectorizer/X86/insert-after-bundle1.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/insert-after-bundle1.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; For this bundle [ store i32 %sub.i251.5 ...;
+;                   store i32 %sub.i251.4 ...;
+;                   store i32 %sub.i251.3 ...;
+;                   store i32 %sub.i251.2 ...]
+; the last scheduled instruction is expected to be store i32 %sub.i251.2...,
+; but after scheduling it is store i32 %sub.i251.5...
+
+define dso_local void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX_I252_2:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 13
+; CHECK-NEXT:    [[ARRAYIDX_I252_3:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 12
+; CHECK-NEXT:    [[ARRAYIDX_I252_4:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 11
+; CHECK-NEXT:    [[ARRAYIDX_I252_5:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 10
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX_I252_5]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> undef, <4 x i32>* [[TMP0]], align 8
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %sub.i251.2 = sub nsw i32 0, undef
+  %arrayidx.i252.2 = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 13
+  store i32 %sub.i251.2, i32* %arrayidx.i252.2, align 4
+  %sub.i251.3 = sub nsw i32 undef, undef
+  %arrayidx.i252.3 = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 12
+  store i32 %sub.i251.3, i32* %arrayidx.i252.3, align 16
+  %sub.i251.4 = sub nsw i32 undef, undef
+  %arrayidx.i252.4 = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 11
+  store i32 %sub.i251.4, i32* %arrayidx.i252.4, align 4
+  %sub.i251.5 = sub nsw i32 undef, undef
+  %arrayidx.i252.5 = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 10
+  store i32 %sub.i251.5, i32* %arrayidx.i252.5, align 8
+  unreachable
+}
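
For reference, below is a minimal standalone C++ sketch of the list surgery
reorderBundles() performs; the names Node and moveLastScheduledToFront are
hypothetical stand-ins, not the LLVM API. The bundle member that was scheduled
last is moved to the head of the FirstInBundle/NextInBundle list, so that
getLastInstruction reduces to a single FirstInBundle->Inst lookup. For
simplicity the sketch relinks every member in the new order; the patch above
rewrites only the three links the swap disturbs, which yields the same list.

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Stand-in for ScheduleData: each bundle member carries FirstInBundle /
// NextInBundle links, the two fields reorderBundles() rewires.
struct Node {
  int Id = 0;                    // Stand-in for the underlying Instruction.
  Node *FirstInBundle = nullptr; // Head of the bundle list.
  Node *NextInBundle = nullptr;  // Next bundle member, or null at the end.
};

// Move LastScheduled to the head of the bundle that starts at First and
// return the new head. Mirrors the swap in reorderBundles().
static Node *moveLastScheduledToFront(Node *First, Node *LastScheduled) {
  // The first member is already the last one scheduled; nothing to do.
  if (First == LastScheduled)
    return First;

  // Flatten the bundle into a vector, remembering where LastScheduled sits.
  std::vector<Node *> Bundle;
  std::size_t LastPos = 0;
  for (Node *N = First; N; N = N->NextInBundle) {
    if (N == LastScheduled)
      LastPos = Bundle.size();
    Bundle.push_back(N);
  }

  // Swap positions 0 and LastPos, then relink the list in the new order.
  std::swap(Bundle[0], Bundle[LastPos]);
  for (std::size_t I = 0; I < Bundle.size(); ++I) {
    Bundle[I]->FirstInBundle = Bundle[0];
    Bundle[I]->NextInBundle =
        (I + 1 < Bundle.size()) ? Bundle[I + 1] : nullptr;
  }
  return Bundle[0];
}

int main() {
  // A bundle of four members, N0 -> N1 -> N2 -> N3, where N3 (cf. the store
  // of %sub.i251.2 in the test) happens to be the last one scheduled.
  Node N0{0}, N1{1}, N2{2}, N3{3};
  N0.NextInBundle = &N1;
  N1.NextInBundle = &N2;
  N2.NextInBundle = &N3;
  for (Node *N : {&N0, &N1, &N2, &N3})
    N->FirstInBundle = &N0;

  // After reordering, N3 heads the list: N3 -> N1 -> N2 -> N0.
  Node *Head = moveLastScheduledToFront(&N0, &N3);
  assert(Head == &N3 && Head->NextInBundle == &N1);
  assert(N1.NextInBundle == &N2 && N2.NextInBundle == &N0);
  assert(N0.NextInBundle == nullptr);
  return 0;
}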