Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -90,7 +90,9 @@
 
   /// Reorders the users of I after vectorization to ensure that I dominates its
   /// users.
-  void reorder(Instruction *I);
+  void reorderBefore(Instruction *I);
+  void reorderBeforeHelper(Instruction *I,
+                           SmallPtrSet<Instruction *, 16> &InstructionsToMove);
 
   /// Returns the first and the last instructions in Chain.
   std::pair<BasicBlock::iterator, BasicBlock::iterator>
@@ -333,22 +335,50 @@
   return X2 == OffsetSCEVB;
 }
 
-void Vectorizer::reorder(Instruction *I) {
-  Instruction *InsertAfter = I;
-  for (User *U : I->users()) {
-    Instruction *User = dyn_cast<Instruction>(U);
-    if (!User || User->getOpcode() == Instruction::PHI)
-      continue;
+void Vectorizer::reorderBeforeHelper(
+    Instruction *I, SmallPtrSet<Instruction *, 16> &InstructionsToMove) {
+
+  SmallVector<Instruction *, 16> Worklist;
+  Worklist.insert(Worklist.end(), I);
+  while (Worklist.size()) {
+    auto LastElement = Worklist.end();
+    --LastElement;
+    I = *LastElement;
+    Worklist.erase(LastElement);
+    int NumOperands = I->getNumOperands();
+    for (int i = 0; i < NumOperands; i++) {
+      Instruction *IM = dyn_cast<Instruction>(I->getOperand(i));
+      if (!IM || IM->getOpcode() == Instruction::PHI)
+        continue;
 
-    if (!DT.dominates(I, User)) {
-      User->removeFromParent();
-      User->insertAfter(InsertAfter);
-      InsertAfter = User;
-      reorder(User);
+      if (!DT.dominates(IM, I)) {
+        InstructionsToMove.insert(IM);
+        Worklist.insert(Worklist.end(), IM);
+      }
     }
   }
 }
 
+void Vectorizer::reorderBefore(Instruction *I) {
+  SmallPtrSet<Instruction *, 16> InstructionsToMove;
+  reorderBeforeHelper(I, InstructionsToMove);
+
+  for (auto IM = I->getParent()->begin(), E = I->getParent()->end(); IM != E;
+       ++IM) {
+    if (!is_contained(InstructionsToMove, &*IM))
+      continue;
+    Instruction *InstructionToMove = &*IM;
+    --IM;
+    InstructionToMove->removeFromParent();
+    InstructionToMove->insertBefore(I);
+    InstructionsToMove.erase(InstructionToMove);
+  }
+  // This assert can fire if the helper marked instructions outside this BB
+  // for reordering.
+  assert(InstructionsToMove.size() == 0 &&
+         "Failed to reorder all instructions");
+}
+
 std::pair<BasicBlock::iterator, BasicBlock::iterator>
 Vectorizer::getBoundaryInstrs(ArrayRef<Value *> Chain) {
   Instruction *C0 = cast<Instruction>(Chain[0]);
@@ -857,7 +887,7 @@
     return false;
 
   // Set insert point.
-  Builder.SetInsertPoint(&*Last);
+  Builder.SetInsertPoint(&*First);
 
   Value *Bitcast =
       Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
@@ -869,6 +899,7 @@
   if (VecLoadTy) {
     SmallVector<Instruction *, 16> InstrsToErase;
     SmallVector<Instruction *, 16> InstrsToReorder;
+    InstrsToReorder.push_back(cast<Instruction>(Bitcast));
 
     unsigned VecWidth = VecLoadTy->getNumElements();
     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
@@ -884,18 +915,18 @@
 
         // Replace the old instruction.
         UI->replaceAllUsesWith(Extracted);
-        InstrsToReorder.push_back(Extracted);
         InstrsToErase.push_back(UI);
       }
     }
 
     for (Instruction *ModUser : InstrsToReorder)
-      reorder(ModUser);
+      reorderBefore(ModUser);
 
     for (auto I : InstrsToErase)
      I->eraseFromParent();
   } else {
     SmallVector<Instruction *, 16> InstrsToReorder;
+    InstrsToReorder.push_back(cast<Instruction>(Bitcast));
 
     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
       Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(I));
@@ -908,11 +939,10 @@
 
      // Replace the old instruction.
      UI->replaceAllUsesWith(Extracted);
-      InstrsToReorder.push_back(Extracted);
     }
 
     for (Instruction *ModUser : InstrsToReorder)
-      reorder(ModUser);
+      reorderBefore(ModUser);
   }
 
   eraseInstructions(Chain);
@@ -930,4 +960,3 @@
                                                    Alignment, &Fast);
   return Allows && Fast;
 }
-
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -4,11 +4,12 @@
 
 ; Check relative position of the inserted vector load relative to the
 ; existing adds.
+; Vectorized loads should be inserted at the position of the first load.
 
 ; CHECK-LABEL: @insert_load_point(
 ; CHECK: %z = add i32 %x, 4
-; CHECK: %w = add i32 %y, 9
 ; CHECK: load <2 x float>
+; CHECK: %w = add i32 %y, 9
 ; CHECK: %foo = add i32 %z, %w
 define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
 entry:
Index: test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
@@ -0,0 +1,26 @@
+; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @correct_order(
+; CHECK: bitcast i32*
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: bitcast i32*
+; CHECK: store <2 x i32>
+; CHECK: load i32
+define void @correct_order(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l1 = load i32, i32* %next.gep1, align 4
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
Index: test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
+++ test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
@@ -6,6 +6,9 @@
 
 ; Check an i32 and i8* get vectorized, and that
 ; the two accesses (load into buff.val and store to buff.p) preserve their order.
+; Vectorized loads should be inserted at the position of the first load,
+; and instructions which were between the first and last load should be
+; reordered, preserving their relative order as much as possible.
 
 ; CHECK-LABEL: @preserve_order_32(
 ; CHECK: load <2 x i32>
Index: test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
+++ test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
@@ -6,6 +6,9 @@
 
 ; Check an i64 and i8* get vectorized, and that
 ; the two accesses (load into buff.val and store to buff.p) preserve their order.
+; Vectorized loads should be inserted at the position of the first load,
+; and instructions which were between the first and last load should be
+; reordered, preserving their relative order as much as possible.
 
 ; CHECK-LABEL: @preserve_order_64(
 ; CHECK: load <2 x i64>
@@ -22,4 +25,33 @@
   ret void
 }
 
+; Check that no vectorization occurs across a phi node.
+
+; CHECK-LABEL: @no_vect_phi(
+; CHECK: load i8*
+; CHECK: load i8
+; CHECK: store i8 0
+; CHECK: load i64
+define void @no_vect_phi(i32* noalias %ptr, %struct.buffer_t* noalias %buff) {
+entry:
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
+  %buff.p = load i8*, i8** %tmp1, align 8
+  %buff.val = load i8, i8* %buff.p, align 8
+  store i8 0, i8* %buff.p, align 8
+  br label %"for something"
+
+"for something":
+  %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ]
+
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
+  %buff.int = load i64, i64* %tmp0, align 8
+
+  %index.next = add i64 %index, 8
+  %cmp_res = icmp eq i64 %index.next, 8
+  br i1 %cmp_res, label %ending, label %"for something"
+
+ending:
+  ret void
+}
+
 attributes #0 = { nounwind }
Index: test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
@@ -0,0 +1,91 @@
+; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Check that subsets of the load/store chains get vectorized in the presence
+; of interleaved loads/stores.
+
+; CHECK-LABEL: @interleave_2L_2S(
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+define void @interleave_2L_2S(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l1 = load i32, i32* %next.gep1, align 4
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @interleave_3L_2S_1L(
+; CHECK: load <3 x i32>
+; CHECK: store <2 x i32>
+; CHECK: load i32
+
+define void @interleave_3L_2S_1L(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l2 = load i32, i32* %next.gep, align 4
+  %l1 = load i32, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @chain_suffix(
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+; CHECK: load i32
+define void @chain_suffix(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+
+; CHECK-LABEL: @chain_prefix_suffix(
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+define void @chain_prefix_suffix(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+  %next.gep3 = getelementptr i32, i32* %ptr, i64 3
+
+  %l1 = load i32, i32* %next.gep, align 4
+  %l2 = load i32, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep2, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+  %l5 = load i32, i32* %next.gep3, align 4
+
+  ret void
+}
+
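
For reviewers, a minimal standalone sketch (not part of the patch) of the reordering scheme that reorderBefore/reorderBeforeHelper implement: walk the operand chains of the instruction at the insertion point, mark every def that does not yet dominate its user, then move the marked instructions in front of the insertion point in their original block order. The helper name moveDefsBefore, the includes, and the inline sizes of 16 are illustrative assumptions, not taken from the patch.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Ensure every def feeding I ends up before I, preserving the defs'
// relative order within the block.
static void moveDefsBefore(Instruction *I, DominatorTree &DT) {
  SmallPtrSet<Instruction *, 16> ToMove;
  SmallVector<Instruction *, 16> Worklist;
  Worklist.push_back(I);

  // Mark every non-PHI instruction in I's operand chains that does not
  // already dominate its user (corresponds to reorderBeforeHelper).
  while (!Worklist.empty()) {
    Instruction *W = Worklist.pop_back_val();
    for (Value *Op : W->operands()) {
      auto *OpI = dyn_cast<Instruction>(Op);
      if (!OpI || isa<PHINode>(OpI) || DT.dominates(OpI, W))
        continue;
      if (ToMove.insert(OpI).second)
        Worklist.push_back(OpI);
    }
  }

  // Scan the block top-down and move each marked instruction in front of I;
  // the top-down scan preserves the marked instructions' relative order
  // (corresponds to reorderBefore). Marked instructions outside I's block
  // are not handled, matching the assert in the patch.
  BasicBlock *BB = I->getParent();
  for (auto It = BB->begin(); It != BB->end();) {
    Instruction *Cur = &*It++;
    if (ToMove.erase(Cur))
      Cur->moveBefore(I);
  }
}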