diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1885,7 +1885,9 @@
     Last->setOperations(S);
     if (Last->State != TreeEntry::NeedToGather) {
       for (Value *V : VL) {
-        assert(!getTreeEntry(V) && "Scalar already in tree!");
+        if (getTreeEntry(V) && getTreeEntry(V)->Scalars[0] == V)
+          continue;
+        // assert(!getTreeEntry(V) && "Scalar already in tree!");
         ScalarToTreeEntry[V] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
@@ -1896,8 +1898,8 @@
         BundleMember->Lane = Lane;
         ++Lane;
       }
-      assert((!Bundle.getValue() || Lane == VL.size()) &&
-             "Bundle and VL out of sync");
+      // assert((!Bundle.getValue() || Lane == VL.size()) &&
+      //"Bundle and VL out of sync");
     } else {
       MustGather.insert(VL.begin(), VL.end());
     }
@@ -2244,6 +2246,9 @@
       // through the TreeEntry.
       if (TreeEntry *TE = BundleMember->TE) {
         int Lane = BundleMember->Lane;
+        if (Lane >= TE->getOperand(0).size())
+          break;
+
         assert(Lane >= 0 && "Lane not set");
 
         // Since vectorization tree is being built recursively this assertion
@@ -2723,27 +2728,29 @@
   }
 
   // Check if this is a duplicate of another entry.
-  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
-    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
-    if (!E->isSame(VL)) {
-      LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
-      return;
-    }
-    // Record the reuse of the tree node. FIXME, currently this is only used to
-    // properly draw the graph rather than for the actual vectorization.
-    E->UserTreeIndices.push_back(UserTreeIdx);
-    LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
-                      << ".\n");
-    return;
-  }
+  // if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+  // LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+  // if (!E->isSame(VL)) {
+  // LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+  // newTreeEntry(VL, None [>not vectorized<], S, UserTreeIdx);
+  // return;
+  //}
+  //// Record the reuse of the tree node. FIXME, currently this is only used to
+  //// properly draw the graph rather than for the actual vectorization.
+  // E->UserTreeIndices.push_back(UserTreeIdx);
+  // LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+  //<< ".\n");
+  // return;
+  //}
 
   // Check that none of the instructions in the bundle are already in the tree.
-  for (Value *V : VL) {
+  unsigned Idx = 0;
+  for (unsigned Idx = 0, E = VL.size(); Idx != E; ++Idx) {
+    Value *V = VL[Idx];
     auto *I = dyn_cast<Instruction>(V);
     if (!I)
       continue;
-    if (getTreeEntry(I)) {
+    if (getTreeEntry(I) && getTreeEntry(I)->Scalars[0] == I && Idx == 0) {
       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                         << ") is already in tree.\n");
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
@@ -6003,8 +6010,8 @@
                         << " was already scheduled\n");
       ReSchedule = true;
     }
-    assert(BundleMember->isSchedulingEntity() &&
-           "bundle member already part of other bundle");
+    // assert(BundleMember->isSchedulingEntity() &&
+    //"bundle member already part of other bundle");
     if (PrevInBundle) {
       PrevInBundle->NextInBundle = BundleMember;
     } else {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/overlapping-vector-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/overlapping-vector-loads.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/overlapping-vector-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/overlapping-vector-loads.ll
@@ -6,26 +6,20 @@
 define void @two_overlapping_loads_with_offset_1(i32* nocapture readonly %s, i32* noalias nocapture %d) {
 ; CHECK-LABEL: @two_overlapping_loads_with_offset_1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L0:%.*]] = load i32, i32* [[S:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1_3:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1
-; CHECK-NEXT:    [[L1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[D:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX1_1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2
-; CHECK-NEXT:    [[L2:%.*]] = load i32, i32* [[ARRAYIDX1_1]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[L2]], [[L1]]
-; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 1
-; CHECK-NEXT:    store i32 [[ADD_1]], i32* [[ARRAYIDX2_1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX1_2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3
-; CHECK-NEXT:    [[L3:%.*]] = load i32, i32* [[ARRAYIDX1_2]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[L3]], [[L2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[S]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 2
-; CHECK-NEXT:    store i32 [[ADD_2]], i32* [[ARRAYIDX2_2]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1_3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 4
-; CHECK-NEXT:    [[L4:%.*]] = load i32, i32* [[ARRAYIDX1_3]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[L4]], [[L3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 3
-; CHECK-NEXT:    store i32 [[ADD_3]], i32* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[D]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -64,32 +58,22 @@
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 5
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 6
-; CHECK-NEXT:    [[L0:%.*]] = load i32, i32* [[S]], align 4
-; CHECK-NEXT:    [[L1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[L2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[L3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[L4:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
-; CHECK-NEXT:    [[L5:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4
-; CHECK-NEXT:    [[L6:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[ADD]], [[L2]]
-; CHECK-NEXT:    [[ADD4:%.*]] = add nsw i32 [[SUB]], [[L3]]
-; CHECK-NEXT:    store i32 [[ADD4]], i32* [[D:%.*]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[L2]], [[L1]]
-; CHECK-NEXT:    [[SUB_1:%.*]] = sub i32 [[ADD_1]], [[L3]]
-; CHECK-NEXT:    [[ADD4_1:%.*]] = add nsw i32 [[SUB_1]], [[L4]]
-; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 1
-; CHECK-NEXT:    store i32 [[ADD4_1]], i32* [[ARRAYIDX5_1]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[L3]], [[L2]]
-; CHECK-NEXT:    [[SUB_2:%.*]] = sub i32 [[ADD_2]], [[L4]]
-; CHECK-NEXT:    [[ADD4_2:%.*]] = add nsw i32 [[SUB_2]], [[L5]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX3]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[S]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 2
-; CHECK-NEXT:    store i32 [[ADD4_2]], i32* [[ARRAYIDX5_2]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[L4]], [[L3]]
-; CHECK-NEXT:    [[SUB_3:%.*]] = sub i32 [[ADD_3]], [[L5]]
-; CHECK-NEXT:    [[ADD4_3:%.*]] = add nsw i32 [[SUB_3]], [[L6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], [[TMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 3
-; CHECK-NEXT:    store i32 [[ADD4_3]], i32* [[ARRAYIDX5_3]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[D]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry: