Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2093,11 +2093,13 @@
   case AArch64::STRSui:
   case AArch64::STRDui:
   case AArch64::STRQui:
+  case AArch64::STRQpre:
   case AArch64::STRXui:
   case AArch64::STRWui:
   case AArch64::LDRSui:
   case AArch64::LDRDui:
   case AArch64::LDRQui:
+  case AArch64::LDRQpre:
   case AArch64::LDRXui:
   case AArch64::LDRWui:
   case AArch64::LDRSWui:
@@ -2208,23 +2210,35 @@
 // Is this a candidate for ld/st merging or pairing? For example, we don't
 // touch volatiles or load/stores that have a hint to avoid pair formation.
 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
-  // If this is a volatile load/store, don't mess with it.
-  if (MI.hasOrderedMemoryRef())
+  // If this is a volatile load/store, don't mess with it unless the
+  // instruction is an LDRQpre.
+  if (MI.hasOrderedMemoryRef() && MI.getOpcode() != AArch64::LDRQpre)
     return false;

   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
   assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
          "Expected a reg or frame index operand.");
-  if (!MI.getOperand(2).isImm())
+
+  // For pre-indexed addressing quadword instructions, the immediate is
+  // at operand index 3 rather than 2.
+  bool IsPreLdSt = ((MI.getOpcode() == AArch64::STRQpre) ||
+                    (MI.getOpcode() == AArch64::LDRQpre)) &&
+                   MI.getOperand(3).isImm();
+
+  if (!MI.getOperand(2).isImm() && !IsPreLdSt)
     return false;

   // Can't merge/pair if the instruction modifies the base register.
   // e.g., ldr x0, [x0]
   // This case will never occur with an FI base.
+  // However, an LDRQpre can still be merged, e.g.,
+  //   ldr q0, [x11, #32]!
+  //   ldr q1, [x11, #16]
+  // to ldp q0, q1, [x11, #32]!
   if (MI.getOperand(1).isReg()) {
     Register BaseReg = MI.getOperand(1).getReg();
     const TargetRegisterInfo *TRI = &getRegisterInfo();
-    if (MI.modifiesRegister(BaseReg, TRI))
+    if (MI.modifiesRegister(BaseReg, TRI) && MI.getOpcode() != AArch64::LDRQpre)
       return false;
   }

@@ -2682,7 +2696,9 @@
   case AArch64::STRQui:
   case AArch64::STURQi:
   case AArch64::LDPQi:
+  case AArch64::LDRQpre:
   case AArch64::STPQi:
+  case AArch64::STRQpre:
   case AArch64::STGOffset:
   case AArch64::STZGOffset:
   case AArch64::ST2GOffset:
Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -249,6 +249,7 @@
   case AArch64::STRDui:
   case AArch64::STURDi:
   case AArch64::STRQui:
+  case AArch64::STRQpre:
   case AArch64::STURQi:
   case AArch64::STRBBui:
   case AArch64::STURBBi:
@@ -261,6 +262,7 @@
   case AArch64::LDRDui:
   case AArch64::LDURDi:
   case AArch64::LDRQui:
+  case AArch64::LDRQpre:
   case AArch64::LDURQi:
   case AArch64::LDRWui:
   case AArch64::LDURWi:
@@ -310,6 +312,8 @@
   case AArch64::STRQui:
   case AArch64::STURQi:
     return AArch64::STPQi;
+  case AArch64::STRQpre:
+    return AArch64::STPQpre;
   case AArch64::STRWui:
   case AArch64::STURWi:
     return AArch64::STPWi;
@@ -325,6 +329,8 @@
   case AArch64::LDRQui:
   case AArch64::LDURQi:
     return AArch64::LDPQi;
+  case AArch64::LDRQpre:
+    return AArch64::LDPQpre;
   case AArch64::LDRWui:
   case AArch64::LDURWi:
     return AArch64::LDPWi;
@@ -539,15 +545,30 @@
   }
 }

+// Returns whether the instruction is a pre-indexed load/store.
+static bool isPreLdSt(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::STRQpre:
+  case AArch64::LDRQpre:
+    return true;
+  }
+}
+
 // Returns the scale and offset range of pre/post indexed variants of MI.
 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                        int &MinOffset, int &MaxOffset) {
   bool IsPaired = isPairedLdSt(MI);
   bool IsTagStore = isTagStore(MI);
+  bool IsPreLdSt = isPreLdSt(MI);
+
   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
   // as in the "unsigned offset" variant.
   // All other pre/post indexed ldst instructions are unscaled.
-  Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
+  Scale = (IsTagStore || IsPaired || IsPreLdSt)
+              ? AArch64InstrInfo::getMemScale(MI)
+              : 1;

   if (IsPaired) {
     MinOffset = -64;
@@ -561,17 +582,20 @@
 static MachineOperand &getLdStRegOp(MachineInstr &MI,
                                     unsigned PairedRegOp = 0) {
   assert(PairedRegOp < 2 && "Unexpected register operand idx.");
-  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
+  bool IsPreLdSt = isPreLdSt(MI);
+  if (IsPreLdSt)
+    PairedRegOp = 1;
+  unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
   return MI.getOperand(Idx);
 }

 static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
+  unsigned Idx = isPairedLdSt(MI) || isPreLdSt(MI) ? 2 : 1;
   return MI.getOperand(Idx);
 }

 static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
+  unsigned Idx = isPairedLdSt(MI) || isPreLdSt(MI) ? 3 : 2;
   return MI.getOperand(Idx);
 }

@@ -940,13 +964,25 @@
       MI.clearRegisterKills(Reg, TRI);
     }
   }
-  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
-            .add(RegOp0)
-            .add(RegOp1)
-            .add(BaseRegOp)
-            .addImm(OffsetImm)
-            .cloneMergedMemRefs({&*I, &*Paired})
-            .setMIFlags(I->mergeFlagsWith(*Paired));
+
+  int Scale, MinOffset, MaxOffset;
+  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
+
+  unsigned MatchPairOpcode = getMatchingPairOpcode(Opc);
+  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
+
+  if (MatchPairOpcode == AArch64::STPQpre ||
+      MatchPairOpcode == AArch64::LDPQpre) {
+    MIB.addReg(BaseRegOp.getReg(), RegState::Define);
+  } else
+    assert(Scale == 1 && "Scale for non-pre/post-indexed variants must be 1.");
+
+  MIB.add(RegOp0)
+      .add(RegOp1)
+      .add(BaseRegOp)
+      .addImm(OffsetImm / Scale)
+      .cloneMergedMemRefs({&*I, &*Paired})
+      .setMIFlags(I->mergeFlagsWith(*Paired));

   (void)MIB;

@@ -1225,8 +1261,10 @@
     return false;

   // We should have already checked FirstMI for pair suppression and volatility.
-  assert(!FirstMI.hasOrderedMemoryRef() &&
-         !TII->isLdStPairSuppressed(FirstMI) &&
+  // An LDRQpre is exempt from these checks.
+  assert(((!FirstMI.hasOrderedMemoryRef() &&
+           !TII->isLdStPairSuppressed(FirstMI)) ||
+          FirstMI.getOpcode() == AArch64::LDRQpre) &&
          "FirstMI shouldn't get here if either of these checks are true.");

   unsigned OpcA = FirstMI.getOpcode();
@@ -1257,6 +1295,11 @@
   if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
     return false;

+  // STRQpre-STRQui and LDRQpre-LDRQui are candidate pairs that can be merged.
+  if ((OpcA == AArch64::STRQpre && OpcB == AArch64::STRQui) ||
+      (OpcA == AArch64::LDRQpre && OpcB == AArch64::LDRQui))
+    return true;
+
   // Try to match an unscaled load/store with a scaled load/store.
return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) && getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); @@ -1643,7 +1686,17 @@ // Update list of instructions that read/write memory. if (MI.mayLoadOrStore()) MemInsns.push_back(&MI); + + // The STRQpre and LDRQpre instructions can be merged with STRQui and LDRQui, respectively. + if (((FirstMI.getOpcode() == AArch64::STRQpre) && + (MI.getOpcode() == AArch64::STRQui)) || + ((FirstMI.getOpcode() == AArch64::LDRQpre) && + (MI.getOpcode() == AArch64::LDRQui))) { + Flags.setMergeForward(true); + return MBBI; + } } + return E; } Index: llvm/test/CodeGen/AArch64/ldrqpre-ldrqui-merge.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/ldrqpre-ldrqui-merge.mir @@ -0,0 +1,250 @@ +# RUN: llc -mtriple=aarch64-none-eabi -mcpu=cortex-a55 -lsr-preferred-addressing-mode=preindexed -run-pass=aarch64-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s +# CHECK: early-clobber $x11, renamable $q0, renamable $q1 = LDPQpre renamable $x11, 2 +# CHECK-NEXT: early-clobber $x10, renamable $q2, renamable $q3 = LDPQpre renamable $x10, 2 + +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-none-unknown-eabi" + + ; Function Attrs: nofree norecurse nounwind uwtable mustprogress + define dso_local void @ldrqpre-ldrqui-merge(float* nocapture readonly %S, float* nocapture %D, i32 %N) local_unnamed_addr #0 { + entry: + %cmp11.not = icmp eq i32 %N, 0 + br i1 %cmp11.not, label %for.cond.cleanup, label %for.body.preheader + + for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + %min.iters.check = icmp ult i32 %N, 8 + br i1 %min.iters.check, label %for.body.preheader2, label %vector.memcheck + + for.body.preheader2: ; preds = %middle.block, %vector.memcheck, %for.body.preheader + %indvars.iv.ph = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ], [ 0, %vector.memcheck ] + %0 = sub i64 %wide.trip.count, %indvars.iv.ph + %1 = add nsw i64 %indvars.iv.ph, -1 + %scevgep3 = getelementptr float, float* %S, i64 %1 + %scevgep7 = getelementptr float, float* %D, i64 %1 + br label %for.body + + vector.memcheck: ; preds = %for.body.preheader + %scevgep = getelementptr float, float* %D, i64 %wide.trip.count + %scevgep16 = getelementptr float, float* %S, i64 %wide.trip.count + %bound0 = icmp ugt float* %scevgep16, %D + %bound1 = icmp ugt float* %scevgep, %S + %found.conflict = and i1 %bound0, %bound1 + br i1 %found.conflict, label %for.body.preheader2, label %vector.ph + + vector.ph: ; preds = %vector.memcheck + %n.vec = and i64 %wide.trip.count, 4294967288 + %scevgep13 = getelementptr float, float* %S, i64 -8 + %scevgep19 = getelementptr float, float* %D, i64 -8 + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv20 = phi float* [ %scevgep21, %vector.body ], [ %scevgep19, %vector.ph ] + %lsr.iv14 = phi float* [ %scevgep15, %vector.body ], [ %scevgep13, %vector.ph ] + %lsr.iv11 = phi i64 [ %lsr.iv.next12, %vector.body ], [ %n.vec, %vector.ph ] + %lsr.iv1416 = bitcast float* %lsr.iv14 to <4 x float>* + %lsr.iv2022 = bitcast float* %lsr.iv20 to <4 x float>* + %scevgep23 = getelementptr <4 x float>, <4 x float>* %lsr.iv2022, i64 2 + %wide.load = load <4 x float>, <4 x float>* %scevgep23, align 4, !tbaa !6, !alias.scope !10, !noalias !13 + %scevgep26 = getelementptr <4 x float>, <4 x float>* %lsr.iv2022, i64 3 + %wide.load18 = load <4 x float>, <4 x float>* %scevgep26, align 4, !tbaa 
!6, !alias.scope !10, !noalias !13 + %scevgep17 = getelementptr <4 x float>, <4 x float>* %lsr.iv1416, i64 2 + %wide.load19 = load <4 x float>, <4 x float>* %scevgep17, align 4, !tbaa !6, !alias.scope !13 + %scevgep18 = getelementptr <4 x float>, <4 x float>* %lsr.iv1416, i64 3 + %wide.load20 = load <4 x float>, <4 x float>* %scevgep18, align 4, !tbaa !6, !alias.scope !13 + %2 = fadd fast <4 x float> %wide.load19, %wide.load + %3 = fadd fast <4 x float> %wide.load20, %wide.load18 + %scevgep24 = getelementptr <4 x float>, <4 x float>* %lsr.iv2022, i64 2 + store <4 x float> %2, <4 x float>* %scevgep24, align 4, !tbaa !6, !alias.scope !10, !noalias !13 + %scevgep25 = getelementptr <4 x float>, <4 x float>* %lsr.iv2022, i64 3 + store <4 x float> %3, <4 x float>* %scevgep25, align 4, !tbaa !6, !alias.scope !10, !noalias !13 + %lsr.iv.next12 = add i64 %lsr.iv11, -8 + %scevgep15 = getelementptr float, float* %lsr.iv14, i64 8 + %scevgep21 = getelementptr float, float* %lsr.iv20, i64 8 + %4 = icmp eq i64 %lsr.iv.next12, 0 + br i1 %4, label %middle.block, label %vector.body, !llvm.loop !15 + + middle.block: ; preds = %vector.body + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader2 + + for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void + + for.body: ; preds = %for.body.preheader2, %for.body + %lsr.iv8 = phi float* [ %scevgep7, %for.body.preheader2 ], [ %scevgep9, %for.body ] + %lsr.iv4 = phi float* [ %scevgep3, %for.body.preheader2 ], [ %scevgep5, %for.body ] + %lsr.iv = phi i64 [ %0, %for.body.preheader2 ], [ %lsr.iv.next, %for.body ] + %scevgep10 = getelementptr float, float* %lsr.iv8, i64 1 + %5 = load float, float* %scevgep10, align 4, !tbaa !6 + %scevgep6 = getelementptr float, float* %lsr.iv4, i64 1 + %6 = load float, float* %scevgep6, align 4, !tbaa !6 + %add = fadd fast float %6, %5 + store float %add, float* %scevgep10, align 4, !tbaa !6 + %lsr.iv.next = add i64 %lsr.iv, -1 + %scevgep5 = getelementptr float, float* %lsr.iv4, i64 1 + %scevgep9 = getelementptr float, float* %lsr.iv8, i64 1 + %exitcond.not = icmp eq i64 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !18 + } + + attributes #0 = { nofree norecurse nounwind uwtable mustprogress "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a55" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+v8.2a" "unsafe-fp-math"="true" } + + !llvm.module.flags = !{!0, !1, !2, !3, !4} + !llvm.ident = !{!5} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 1, !"branch-target-enforcement", i32 0} + !2 = !{i32 1, !"sign-return-address", i32 0} + !3 = !{i32 1, !"sign-return-address-all", i32 0} + !4 = !{i32 1, !"sign-return-address-with-bkey", i32 0} + !5 = !{!"clang version 13.0.0"} + !6 = !{!7, !7, i64 0} + !7 = !{!"float", !8, i64 0} + !8 = !{!"omnipotent char", !9, i64 0} + !9 = !{!"Simple C++ TBAA"} + !10 = !{!11} + !11 = distinct !{!11, !12} + !12 = distinct !{!12, !"LVerDomain"} + !13 = !{!14} + !14 = distinct !{!14, !12} + !15 = distinct !{!15, !16, !17} + !16 = !{!"llvm.loop.mustprogress"} + !17 = !{!"llvm.loop.isvectorized", i32 1} + !18 = distinct !{!18, !16, !17} + +... 
+--- +name: ldrqpre-ldrqui-merge +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$x0', virtual-reg: '' } + - { reg: '$x1', virtual-reg: '' } + - { reg: '$w2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + hasRedZone: false +body: | + bb.0.entry: + successors: %bb.10(0x30000000), %bb.1(0x50000000) + liveins: $w2, $x0, $x1 + + CBZW renamable $w2, %bb.10 + + bb.1.for.body.preheader: + successors: %bb.2(0x40000000), %bb.5(0x40000000) + liveins: $w2, $x0, $x1 + + renamable $w8 = ORRWrs $wzr, renamable $w2, 0, implicit-def $x8 + dead $wzr = SUBSWri killed renamable $w2, 7, 0, implicit-def $nzcv + Bcc 8, %bb.5, implicit killed $nzcv + + bb.2: + successors: %bb.3(0x80000000) + liveins: $x0, $x1, $x8 + + $x9 = ORRXrs $xzr, $xzr, 0 + + bb.3.for.body.preheader2: + successors: %bb.4(0x80000000) + liveins: $x0, $x1, $x8, $x9 + + renamable $x10 = UBFMXri renamable $x9, 62, 61 + $x8 = SUBXrs killed renamable $x8, killed renamable $x9, 0 + renamable $x10 = SUBXri killed renamable $x10, 4, 0 + $x9 = ADDXrs killed renamable $x0, renamable $x10, 0 + $x10 = ADDXrs killed renamable $x1, killed renamable $x10, 0 + + bb.4.for.body: + successors: %bb.10(0x04000000), %bb.4(0x7c000000) + liveins: $x8, $x9, $x10 + + early-clobber renamable $x10, renamable $s0 = LDRSpre killed renamable $x10, 4 + early-clobber renamable $x9, renamable $s1 = LDRSpre killed renamable $x9, 4 + renamable $s0 = nnan ninf nsz arcp contract afn reassoc FADDSrr killed renamable $s1, killed renamable $s0 + renamable $x8 = SUBSXri killed renamable $x8, 1, 0, implicit-def $nzcv + STRSui killed renamable $s0, renamable $x10, 0 :: (store 4 into %ir.scevgep10, !tbaa !6) + Bcc 1, %bb.4, implicit $nzcv + B %bb.10 + + bb.5.vector.memcheck: + successors: %bb.6(0x60000000), %bb.7(0x20000000) + liveins: $x0, $x1, $x8 + + renamable $x9 = UBFMXri renamable $x8, 62, 61 + $x10 = ADDXrs renamable $x0, renamable $x9, 0 + $xzr = SUBSXrs killed renamable $x10, renamable $x1, 0, implicit-def $nzcv, implicit-def $nzcv + Bcc 9, %bb.7, implicit $nzcv + + bb.6.vector.memcheck: + successors: %bb.2(0x55555555), %bb.7(0x2aaaaaab) + liveins: $x0, $x1, $x8, $x9 + + $x9 = ADDXrs renamable $x1, killed renamable $x9, 0 + $xzr = SUBSXrs killed renamable $x9, renamable $x0, 0, implicit-def $nzcv, implicit-def $nzcv + Bcc 8, %bb.2, implicit $nzcv + + bb.7.vector.ph: + successors: %bb.8(0x80000000) + liveins: $x0, $x1, $x8 + + renamable $x9 = ANDXri renamable $x8, 8028 + renamable $x10 = SUBXri renamable $x0, 32, 0 + renamable $x11 = SUBXri renamable $x1, 32, 0 + $x12 = ORRXrs $xzr, $x9, 0 + + bb.8.vector.body: + successors: %bb.9(0x04000000), %bb.8(0x7c000000) + liveins: $x0, $x1, $x8, $x9, $x10, $x11, $x12 + + early-clobber $x11, renamable $q0, renamable $q1 = LDPQpre renamable $x11, 2 + early-clobber $x10, renamable $q2, renamable $q3 = LDPQpre renamable $x10, 2 + renamable $q0 = nnan ninf nsz arcp contract afn reassoc 
FADDv4f32 killed renamable $q2, killed renamable $q0
+    renamable $q1 = nnan ninf nsz arcp contract afn reassoc FADDv4f32 killed renamable $q3, killed renamable $q1
+    renamable $x12 = SUBSXri killed renamable $x12, 8, 0, implicit-def $nzcv
+    STPQi renamable $q0, renamable $q1, renamable $x11, 0 :: (store 16 into %ir.scevgep24, align 4, !tbaa !6, !alias.scope !10, !noalias !13), (store 16 into %ir.scevgep25, align 4, !tbaa !6, !alias.scope !10, !noalias !13)
+    Bcc 1, %bb.8, implicit killed $nzcv
+
+  bb.9.middle.block:
+    successors: %bb.10(0x40000000), %bb.3(0x40000000)
+    liveins: $x0, $x1, $x8, $x9
+
+    $xzr = SUBSXrs renamable $x9, renamable $x8, 0, implicit-def $nzcv, implicit-def $nzcv
+    Bcc 1, %bb.3, implicit $nzcv
+
+  bb.10.for.cond.cleanup:
+    RET undef $lr
+
+...
Index: llvm/test/CodeGen/AArch64/strqpre-strqui-merge.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/strqpre-strqui-merge.mir
@@ -0,0 +1,68 @@
+# RUN: llc -mtriple=aarch64-none-eabi -mcpu=cortex-a55 -lsr-preferred-addressing-mode=preindexed -run-pass=aarch64-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s
+# CHECK: early-clobber $x0 = STPQpre killed renamable $q0, killed renamable $q1, renamable $x0, 2 :: (store 16 into %ir.p0), (store 16 into %ir.p1)
+
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-none-unknown-eabi"
+
+  define <4 x i32>* @strqpre-strqui-merge(<4 x i32>* %p, <4 x i32> %a, <4 x i32> %b) #0 {
+  entry:
+    %p0 = getelementptr <4 x i32>, <4 x i32>* %p, i32 2
+    store <4 x i32> %a, <4 x i32>* %p0, align 16
+    %p1 = getelementptr <4 x i32>, <4 x i32>* %p, i32 3
+    store <4 x i32> %b, <4 x i32>* %p1, align 16
+    ret <4 x i32>* %p0
+  }
+
+  attributes #0 = { "target-cpu"="cortex-a55" }
+
+...
+---
+name: strqpre-strqui-merge
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+  - { reg: '$x0', virtual-reg: '' }
+  - { reg: '$q0', virtual-reg: '' }
+  - { reg: '$q1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 1
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  localFrameSize: 0
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+  hasRedZone: false
+body: |
+  bb.0.entry:
+    liveins: $q0, $q1, $x0
+    early-clobber renamable $x0 = STRQpre killed renamable $q0, killed renamable $x0, 32 :: (store 16 into %ir.p0)
+    STRQui killed renamable $q1, renamable $x0, 1 :: (store 16 into %ir.p1)
+    RET undef $lr, implicit $x0
+
+...