Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9247,6 +9247,41 @@
   return NewST1;
 }
 
+// Check if addresses X and Y differ by 16 in one direction or another.
+static bool BaseOffsetDifference16(SDValue X, SDValue Y) {
+  int64_t PartialOffset = 0;
+  if (X.getOpcode() == ISD::SIGN_EXTEND)
+    X = X->getOperand(0);
+  if (Y.getOpcode() == ISD::SIGN_EXTEND)
+    Y = Y->getOperand(0);
+  if (X.getOpcode() == ISD::ADD && isa<ConstantSDNode>(X->getOperand(1))) {
+    PartialOffset -= cast<ConstantSDNode>(X->getOperand(1))->getSExtValue();
+    X = X->getOperand(0);
+  }
+  if (Y.getOpcode() == ISD::ADD && isa<ConstantSDNode>(Y->getOperand(1))) {
+    PartialOffset += cast<ConstantSDNode>(Y->getOperand(1))->getSExtValue();
+    Y = Y->getOperand(0);
+  }
+  while (X.getOpcode() == ISD::ADD && Y.getOpcode() == ISD::ADD) {
+    if (X->getOperand(0) == Y->getOperand(0)) {
+      X = X->getOperand(1);
+      Y = Y->getOperand(1);
+    } else if (X->getOperand(1) == Y->getOperand(1)) {
+      X = X->getOperand(0);
+      Y = Y->getOperand(0);
+    } else
+      break;
+  }
+  if (isa<ConstantSDNode>(X) && isa<ConstantSDNode>(Y)) {
+    PartialOffset -= cast<ConstantSDNode>(X)->getSExtValue();
+    PartialOffset += cast<ConstantSDNode>(Y)->getSExtValue();
+    return (PartialOffset == 16) || (PartialOffset == -16);
+  }
+  if (X == Y)
+    return (PartialOffset == 16) || (PartialOffset == -16);
+  return false;
+}
+
 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
 /// load store optimizer pass will merge them to store pair stores. This should
 /// be better than a movi to create the vector zero followed by a vector store
@@ -9278,11 +9313,27 @@
   if (StVal.getOpcode() != ISD::BUILD_VECTOR)
     return SDValue();
 
-  // If the zero constant has more than one use then the vector store could be
-  // better since the constant mov will be amortized and stp q instructions
-  // should be able to be formed.
-  if (!StVal.hasOneUse())
-    return SDValue();
+  // If the zero constant has more than one use then the vector store
+  // could be better since the constant mov will be amortized and stp
+  // q instructions should be able to be formed. The exception to this
+  // is when all uses of StVal are non-consecutive 16-byte stores. In
+  // that case, it is better to replace each one with stp xzr, xzr.
+
+  if (!StVal.hasOneUse()) {
+    if (VT.getVectorElementType().getSizeInBits() != 64)
+      return SDValue();
+    SmallVector<SDValue, 8> STAddrs;
+    for (auto *U : StVal->uses()) {
+      if ((U->getOpcode() != ISD::STORE) ||
+          (U->getOperand(1).getValueType().getSizeInBits() != 16 * 8))
+        return SDValue();
+      SDValue Addr = U->getOperand(2);
+      for (SDValue &OtherAddr : STAddrs)
+        if (BaseOffsetDifference16(Addr, OtherAddr))
+          return SDValue();
+      STAddrs.push_back(Addr);
+    }
+  }
 
   // If the immediate offset of the address operand is too large for the stp
   // instruction, then bail out.
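For reference, the decision the new code makes reduces to this: once two zero-store addresses have been stripped down to a common base plus constant offsets, they only block the scalar lowering if they are exactly 16 bytes apart (in which case the existing movi + stp q path is preferable); otherwise each 128-bit zero store can be lowered to stp xzr, xzr. The standalone C++ sketch below illustrates that check on an already-simplified base+offset form; the Addr struct and baseOffsetDifference16 function are hypothetical illustrations, not LLVM code.

#include <cstdint>
#include <cstdlib>
#include <string>

// Hypothetical simplified address: a symbolic base plus a constant byte
// offset, which is what BaseOffsetDifference16 recovers by walking
// SIGN_EXTEND/ADD nodes and cancelling matching operands in the DAG.
struct Addr {
  std::string Base;
  int64_t Offset;
};

// True when X and Y are exactly 16 bytes apart off the same base, i.e. the
// two 128-bit zero stores are consecutive and would merge into stp q.
static bool baseOffsetDifference16(const Addr &X, const Addr &Y) {
  return X.Base == Y.Base && std::llabs(X.Offset - Y.Offset) == 16;
}

int main() {
  Addr A{"p", 0}, B{"p", 16}, C{"p", 24};
  // A/B are consecutive 16-byte slots: keep the vector (movi + stp q) form.
  // A/C are not consecutive: each store is better replaced by stp xzr, xzr.
  return (baseOffsetDifference16(A, B) && !baseOffsetDifference16(A, C)) ? 0 : 1;
}

In the DAG the addresses are not this tidy, which is why the patched code first peels SIGN_EXTEND and constant ADD operands, then cancels common ADD operands from both sides before comparing the accumulated constant offsets.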
Index: test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- test/CodeGen/AArch64/ldst-opt.ll
+++ test/CodeGen/AArch64/ldst-opt.ll
@@ -1608,3 +1608,59 @@
   store <4 x double> zeroinitializer, <4 x double>* %p
   ret void
 }
+
+; Verify that non-consecutive merges do not generate q0
+define void @merge_multiple_128bit_stores(i64* %p) {
+; CHECK-LABEL: merge_multiple_128bit_stores
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #24]
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #48]
+; CHECK-NEXT: ret
+entry:
+  store i64 0, i64* %p
+  %p1 = getelementptr i64, i64* %p, i64 1
+  store i64 0, i64* %p1
+  %p3 = getelementptr i64, i64* %p, i64 3
+  store i64 0, i64* %p3
+  %p4 = getelementptr i64, i64* %p, i64 4
+  store i64 0, i64* %p4
+  %p6 = getelementptr i64, i64* %p, i64 6
+  store i64 0, i64* %p6
+  %p7 = getelementptr i64, i64* %p, i64 7
+  store i64 0, i64* %p7
+  ret void
+}
+
+; Verify that consecutive stores generate stp q
+define void @merge_multiple_128bit_stores_consec(i64* %p) {
+; CHECK-LABEL: merge_multiple_128bit_stores_consec
+; CHECK: // %entry
+; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}, #32]
+; STRICTALIGN-NEXT: stp xzr, xzr, [x0]
+; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #16]
+; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #32]
+; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48]
+; CHECK-NEXT: ret
+entry:
+  store i64 0, i64* %p
+  %p1 = getelementptr i64, i64* %p, i64 1
+  store i64 0, i64* %p1
+  %p2 = getelementptr i64, i64* %p, i64 2
+  store i64 0, i64* %p2
+  %p3 = getelementptr i64, i64* %p, i64 3
+  store i64 0, i64* %p3
+  %p4 = getelementptr i64, i64* %p, i64 4
+  store i64 0, i64* %p4
+  %p5 = getelementptr i64, i64* %p, i64 5
+  store i64 0, i64* %p5
+  %p6 = getelementptr i64, i64* %p, i64 6
+  store i64 0, i64* %p6
+  %p7 = getelementptr i64, i64* %p, i64 7
+  store i64 0, i64* %p7
+  ret void
+}
+
+