Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -27,8 +27,11 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSchedule.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -154,6 +157,10 @@
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);
 
+  // Evaluate whether the new instruction is a better choice than the old ones.
+  bool isProfitableMergeUpdate(unsigned New,
+                               MachineInstr &OldA, MachineInstr &OldB);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
@@ -168,6 +175,9 @@
   // Find and promote load instructions which read directly from store.
   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
 
+  // Find and merge base register updates before or after a ld/st instruction.
+  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -578,6 +588,101 @@
          getLdStRegOp(MI).getReg() == AArch64::WZR;
 }
 
+static bool isMergeableLdStUpdate(MachineInstr &MI) {
+  // Opcodes eligible for update merging. It's simpler to keep this separate
+  // from the other switches, though not strictly necessary.
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  // Scaled instructions.
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+  case AArch64::STRXui:
+  case AArch64::STRWui:
+  case AArch64::STRHHui:
+  case AArch64::STRBBui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+  case AArch64::LDRXui:
+  case AArch64::LDRWui:
+  case AArch64::LDRHHui:
+  case AArch64::LDRBBui:
+  // Unscaled instructions.
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+  case AArch64::STURQi:
+  case AArch64::STURWi:
+  case AArch64::STURXi:
+  case AArch64::LDURSi:
+  case AArch64::LDURDi:
+  case AArch64::LDURQi:
+  case AArch64::LDURWi:
+  case AArch64::LDURXi:
+  // Paired instructions.
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi:
+  case AArch64::STPSi:
+  case AArch64::STPDi:
+  case AArch64::STPQi:
+  case AArch64::STPWi:
+  case AArch64::STPXi:
+    // Make sure this is a reg+imm (as opposed to an address reloc).
+    if (!getLdStOffsetOp(MI).isImm())
+      return false;
+
+    return true;
+  }
+}
+
+bool AArch64LoadStoreOpt::isProfitableMergeUpdate(unsigned New,
+                                                  MachineInstr &OldA,
+                                                  MachineInstr &OldB) {
+  const MachineFunction *MF = OldA.getMF();
+  // Default as profitable if optimizing for size.
+  if (MF->getFunction()->optForSize())
+    return true;
+
+  TargetSchedModel SM;
+  const TargetSubtargetInfo &STI = MF->getSubtarget();
+  SM.init(STI.getSchedModel(), &STI, STI.getInstrInfo());
+  // Default as profitable in the absence of an instruction scheduling model.
+  if (!SM.hasInstrSchedModel())
+    return true;
+
+  const MCInstrDesc &NewID = TII->get(New),
+                    &OldAID = TII->get(OldA.getOpcode()),
+                    &OldBID = TII->get(OldB.getOpcode());
+  const MCSchedClassDesc
+    *NewSD = SM.getMCSchedModel()->getSchedClassDesc(NewID.getSchedClass()),
+    *OldASD = SM.getMCSchedModel()->getSchedClassDesc(OldAID.getSchedClass()),
+    *OldBSD = SM.getMCSchedModel()->getSchedClassDesc(OldBID.getSchedClass());
+  // Default as profitable without a valid cost model or with variant classes.
+  if (!NewSD->isValid() || NewSD->isVariant() ||
+      !OldASD->isValid() || OldASD->isVariant() ||
+      !OldBSD->isValid() || OldBSD->isVariant())
+    return true;
+
+  unsigned NewLat = SM.computeInstrLatency(New),
+           OldALat = SM.computeInstrLatency(&OldA),
+           OldBLat = SM.computeInstrLatency(&OldB);
+  // It is profitable if faster than the two old instructions combined.
+  if (NewLat < (OldALat + OldBLat))
+    return true;
+  // If just as fast, profitable only with fewer uops than the two combined.
+  else if (NewLat == (OldALat + OldBLat))
+    return (NewSD->NumMicroOps < (OldASD->NumMicroOps + OldBSD->NumMicroOps));
+  // It is not profitable.
+  else
+    return false;
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                            MachineBasicBlock::iterator MergeMI,
@@ -1272,6 +1377,9 @@
 
   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                              : getPostIndexedOpcode(I->getOpcode());
+  if (!isProfitableMergeUpdate(NewOpc, *I, *Update))
+    return ++I;
+
   MachineInstrBuilder MIB;
   if (!isPairedLdSt(*I)) {
     // Non-paired instruction.
@@ -1294,10 +1402,14 @@
   }
   (void)MIB;
 
-  if (IsPreIdx)
+  if (IsPreIdx) {
+    ++NumPreFolded;
     DEBUG(dbgs() << "Creating pre-indexed load/store.");
-  else
+  }
+  else {
+    ++NumPostFolded;
     DEBUG(dbgs() << "Creating post-indexed load/store.");
+  }
   DEBUG(dbgs() << "    Replacing instructions:\n    ");
   DEBUG(I->print(dbgs()));
   DEBUG(dbgs() << "    ");
@@ -1558,6 +1670,60 @@
   return false;
 }
 
+bool
+AArch64LoadStoreOpt::tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock::iterator E = MI.getParent()->end();
+  MachineBasicBlock::iterator Update;
+
+  // Look forward to try to form a post-index instruction. For example,
+  // ldr x0, [x20]
+  // add x20, x20, #32
+  //   merged into:
+  // ldr x0, [x20], #32
+  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
+  if (Update != E) {
+    // Merge the update into the ld/st.
+    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
+    return true;
+  }
+
+  // Don't know how to handle unscaled pre/post-index versions below, so bail.
+  if (TII->isUnscaledLdSt(MI.getOpcode()))
+    return false;
+
+  // Look back to try to find a pre-index instruction. For example,
+  // add x0, x0, #8
+  // ldr x1, [x0]
+  //   merged into:
+  // ldr x1, [x0, #8]!
+  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
+  if (Update != E) {
+    // Merge the update into the ld/st.
+    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
+    return true;
+  }
+
+  // The immediate in the load/store is scaled by the size of the memory
+  // operation. The immediate in the add we're looking for,
+  // however, is not, so adjust here.
+  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
+
+  // Look forward to try to find a pre-index instruction. For example,
+  // ldr x1, [x0, #64]
+  // add x0, x0, #64
+  //   merged into:
+  // ldr x1, [x0, #64]!
+ Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); + return true; + } + + return false; +} + bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt) { bool Modified = false; @@ -1618,7 +1784,6 @@ } else ++MBBI; } - // 3) Find loads and stores that can be merged into a single load or store // pair instruction. // e.g., @@ -1641,119 +1806,15 @@ // ; becomes // ldr x0, [x2], #4 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - MachineInstr &MI = *MBBI; - // Do update merging. It's simpler to keep this separate from the above - // switchs, though not strictly necessary. - unsigned Opc = MI.getOpcode(); - switch (Opc) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - // Scaled instructions. - case AArch64::STRSui: - case AArch64::STRDui: - case AArch64::STRQui: - case AArch64::STRXui: - case AArch64::STRWui: - case AArch64::STRHHui: - case AArch64::STRBBui: - case AArch64::LDRSui: - case AArch64::LDRDui: - case AArch64::LDRQui: - case AArch64::LDRXui: - case AArch64::LDRWui: - case AArch64::LDRHHui: - case AArch64::LDRBBui: - // Unscaled instructions. - case AArch64::STURSi: - case AArch64::STURDi: - case AArch64::STURQi: - case AArch64::STURWi: - case AArch64::STURXi: - case AArch64::LDURSi: - case AArch64::LDURDi: - case AArch64::LDURQi: - case AArch64::LDURWi: - case AArch64::LDURXi: - // Paired instructions. - case AArch64::LDPSi: - case AArch64::LDPSWi: - case AArch64::LDPDi: - case AArch64::LDPQi: - case AArch64::LDPWi: - case AArch64::LDPXi: - case AArch64::STPSi: - case AArch64::STPDi: - case AArch64::STPQi: - case AArch64::STPWi: - case AArch64::STPXi: { - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!getLdStOffsetOp(MI).isImm()) { - ++MBBI; - break; - } - // Look forward to try to form a post-index instruction. For example, - // ldr x0, [x20] - // add x20, x20, #32 - // merged into: - // ldr x0, [x20], #32 - MachineBasicBlock::iterator Update = - findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); + MBBI != E;) + if (isMergeableLdStUpdate(*MBBI)) { + if (tryToMergeLdStUpdate(MBBI)) Modified = true; - ++NumPostFolded; - break; - } - - // Don't know how to handle unscaled pre/post-index versions below, so - // move to the next instruction. - if (TII->isUnscaledLdSt(Opc)) { + else ++MBBI; - break; - } - - // Look back to try to find a pre-index instruction. For example, - // add x0, x0, #8 - // ldr x1, [x0] - // merged into: - // ldr x1, [x0, #8]! - Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); - Modified = true; - ++NumPreFolded; - break; - } - // The immediate in the load/store is scaled by the size of the memory - // operation. The immediate in the add we're looking for, - // however, is not, so adjust here. - int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); - - // Look forward to try to find a post-index instruction. For example, - // ldr x1, [x0, #64] - // add x0, x0, #64 - // merged into: - // ldr x1, [x0, #64]! - Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); - if (Update != E) { - // Merge the update into the ld/st. 
- MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); - Modified = true; - ++NumPreFolded; - break; - } - - // Nothing found. Just move to the next instruction. - ++MBBI; - break; - } } - } + else + ++MBBI; return Modified; } Index: llvm/test/CodeGen/AArch64/ldst-opt.ll =================================================================== --- llvm/test/CodeGen/AArch64/ldst-opt.ll +++ llvm/test/CodeGen/AArch64/ldst-opt.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTRICTALIGN %s -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+strict-align -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=STRICTALIGN %s +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,NOSTRICTALIGN +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mattr=+strict-align -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,STRICTALIGN +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=1 -disable-lsr -verify-machineinstrs -mcpu=exynos-m1 -o - %s | FileCheck %s --check-prefixes=CHECK,EXYNOS ; This file contains tests for the AArch64 load/store optimizer. @@ -7,8 +8,8 @@ %s.byte = type { i8, i8 } %s.halfword = type { i16, i16 } %s.word = type { i32, i32 } -%s.doubleword = type { i64, i32 } -%s.quadword = type { fp128, i32 } +%s.doubleword = type { i64, i64 } +%s.quadword = type { fp128, fp128 } %s.float = type { float, i32 } %s.double = type { double, i32 } %struct.byte = type { %padding, %s.byte } @@ -145,7 +146,9 @@ define void @load-pre-indexed-quadword(%struct.quadword* %ptr) nounwind { ; CHECK-LABEL: load-pre-indexed-quadword -; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]! +; GENERIC: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]! +; EXYNOS: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32] +; EXYNOS-NEXT: add x{{[0-9]+}}, x{{[0-9]+}}, #32 entry: %a = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1, i32 0 %add = load fp128, fp128* %a, align 16 @@ -158,7 +161,9 @@ define void @store-pre-indexed-quadword(%struct.quadword* %ptr, fp128 %val) nounwind { ; CHECK-LABEL: store-pre-indexed-quadword -; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]! +; GENERIC: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]! +; EXYNOS: add x{{[0-9]+}}, x{{[0-9]+}}, #32 +; EXYNOS-NEXT: str q{{[0-9]+}}, [x{{[0-9]+}}, #32] entry: %a = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1, i32 0 store fp128 %val, fp128* %a, align 16 @@ -236,7 +241,9 @@ define void @load-pair-pre-indexed-word(%struct.word* %ptr) nounwind { ; CHECK-LABEL: load-pair-pre-indexed-word -; CHECK: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; GENERIC: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; EXYNOS: ldr w{{[0-9]+}}, [x0, #32]! +; EXYNOS-NEXT: ldr w{{[0-9]+}}, [x0, #4] ; CHECK-NOT: add x0, x0, #32 entry: %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0 @@ -253,7 +260,9 @@ define void @store-pair-pre-indexed-word(%struct.word* %ptr, i32 %val) nounwind { ; CHECK-LABEL: store-pair-pre-indexed-word -; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; GENERIC: stp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; EXYNOS: str w{{[0-9]+}}, [x0, #32]! 
+; EXYNOS-NEXT: str w{{[0-9]+}}, [x0, #4] ; CHECK-NOT: add x0, x0, #32 entry: %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0 @@ -267,6 +276,43 @@ ret void } +define void @load-pair-pre-indexed-doubleword(%struct.doubleword* %ptr) nounwind { +; CHECK-LABEL: load-pair-pre-indexed-doubleword +; GENERIC: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]! +; EXYNOS: ldr x{{[0-9]+}}, [x0, #32]! +; EXYNOS-NEXT: ldr x{{[0-9]+}}, [x0, #8] +; CHECK-NOT: add x0, x0, #32 +entry: + %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0 + %a1 = load i64, i64* %a, align 8 + %b = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 1 + %b1 = load i64, i64* %b, align 8 + %add = add i64 %a1, %b1 + br label %bar +bar: + %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1 + tail call void @bar_doubleword(%s.doubleword* %c, i64 %add) + ret void +} + +define void @store-pair-pre-indexed-doubleword(%struct.doubleword* %ptr, i64 %val) nounwind { +; CHECK-LABEL: store-pair-pre-indexed-doubleword +; GENERIC: stp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]! +; EXYNOS: str x{{[0-9]+}}, [x0, #32]! +; EXYNOS-NEXT: str x{{[0-9]+}}, [x0, #8] +; CHECK-NOT: add x0, x0, #32 +entry: + %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0 + store i64 %val, i64* %a, align 8 + %b = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 1 + store i64 %val, i64* %b, align 8 + br label %bar +bar: + %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1 + tail call void @bar_doubleword(%s.doubleword* %c, i64 %val) + ret void +} + ; Check the following transform: ; ; add x8, x8, #16 @@ -1031,7 +1077,6 @@ define void @store-pair-post-indexed-word() nounwind { ; CHECK-LABEL: store-pair-post-indexed-word ; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [sp], #16 -; CHECK: ret %src = alloca { i32, i32 }, align 8 %dst = alloca { i32, i32 }, align 8 @@ -1050,7 +1095,6 @@ define void @store-pair-post-indexed-doubleword() nounwind { ; CHECK-LABEL: store-pair-post-indexed-doubleword ; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [sp], #32 -; CHECK: ret %src = alloca { i64, i64 }, align 8 %dst = alloca { i64, i64 }, align 8 @@ -1069,7 +1113,6 @@ define void @store-pair-post-indexed-float() nounwind { ; CHECK-LABEL: store-pair-post-indexed-float ; CHECK: stp s{{[0-9]+}}, s{{[0-9]+}}, [sp], #16 -; CHECK: ret %src = alloca { float, float }, align 8 %dst = alloca { float, float }, align 8 @@ -1088,7 +1131,6 @@ define void @store-pair-post-indexed-double() nounwind { ; CHECK-LABEL: store-pair-post-indexed-double ; CHECK: stp d{{[0-9]+}}, d{{[0-9]+}}, [sp], #32 -; CHECK: ret %src = alloca { double, double }, align 8 %dst = alloca { double, double }, align 8 @@ -1104,6 +1146,27 @@ ret void } +define void @store-pair-post-indexed-quadword() nounwind { +; CHECK-LABEL: store-pair-post-indexed-quadword +; GENERIC: stp q{{[0-9]+}}, q{{[0-9]+}}, [sp], #64 +; EXYNOS: str q{{[0-9]+}}, [sp] +; EXYNOS-NEXT: str q{{[0-9]+}}, [sp, #16] +; EXYNOS-NEXT: add sp, sp, #64 + %src = alloca { fp128, fp128 }, align 8 + %dst = alloca { fp128, fp128 }, align 8 + + %src.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %src, i32 0, i32 0 + %src.real = load fp128, fp128* %src.realp + %src.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %src, i32 0, i32 1 + %src.imag = load fp128, fp128* %src.imagp + + %dst.realp = getelementptr 
inbounds { fp128, fp128 }, { fp128, fp128 }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %dst, i32 0, i32 1 + store fp128 %src.real, fp128* %dst.realp + store fp128 %src.imag, fp128* %dst.imagp + ret void +} + ; Check the following transform: ; ; (ldr|str) X, [x20] @@ -1343,6 +1406,7 @@ ; CHECK: // %entry ; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store i32 0, i32* %p @@ -1358,6 +1422,7 @@ ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] +; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store i32 0, i32* %p @@ -1379,6 +1444,7 @@ ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #508] ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #512] ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #516] +; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504] ; CHECK-NEXT: ret entry: %p0 = getelementptr i32, i32* %p, i32 126 @@ -1404,6 +1470,8 @@ ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4100] ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4104] ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4108] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096] ; CHECK-NEXT: ret entry: %p0 = getelementptr i32, i32* %p, i32 1024 @@ -1429,6 +1497,9 @@ ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #16] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #24] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16] ; CHECK-NEXT: ret entry: store i32 0, i32* %p @@ -1455,6 +1526,7 @@ ; CHECK: // %entry ; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <2 x i32> zeroinitializer, <2 x i32>* %p @@ -1469,6 +1541,8 @@ ; NOSTRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8] +; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}] +; EXYNOS-NEXT: str wzr, [x{{[0-9]+}}, #8] ; CHECK-NEXT: ret entry: store <3 x i32> zeroinitializer, <3 x i32>* %p @@ -1482,6 +1556,7 @@ ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] +; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <4 x i32> zeroinitializer, <4 x i32>* %p @@ -1494,6 +1569,7 @@ ; CHECK: // %entry ; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <2 x float> zeroinitializer, <2 x float>* %p @@ -1507,6 +1583,7 @@ ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] +; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <4 x float> zeroinitializer, <4 x float>* %p @@ -1547,6 +1624,7 @@ ; STRICTALIGN: strb ; STRICTALIGN: strb ; STRICTALIGN: strb +; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <2 x i64> zeroinitializer, <2 x i64>* %p, align 1 @@ -1562,6 +1640,9 @@ ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; 
STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #16] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16] ; CHECK-NEXT: ret entry: store i64 0, i64* %p @@ -1601,8 +1682,11 @@ define void @merge_zr64_4vecd(<4 x double>* %p) { ; CHECK-LABEL: merge_zr64_4vecd: ; CHECK: // %entry -; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 -; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] +; GENERIC-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; GENERIC-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <4 x double> zeroinitializer, <4 x double>* %p @@ -1620,6 +1704,10 @@ ; STRICTALIGN-NEXT: stp xzr, xzr, [x0] ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #24] ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q0, [x0] +; EXYNOS-NEXT: stur q0, [x0, #24] +; EXYNOS-NEXT: str q0, [x0, #48] ; CHECK-NEXT: ret entry: store i64 0, i64* %p @@ -1647,6 +1735,11 @@ ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #16] ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #32] ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #32] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #48] ; CHECK-NEXT: ret entry: store i64 0, i64* %p @@ -1670,7 +1763,7 @@ ; Check for bug 34674 where invalid add of xzr was being generated. ; CHECK-LABEL: bug34674: ; CHECK: // %entry -; CHECK-NEXT: mov [[ZREG:x[0-9]+]], xzr +; CHECK-NEXT: mov [[ZREG:x[0-9]+]], {{#0|xzr}} ; CHECK-DAG: stp [[ZREG]], [[ZREG]], [x0] ; CHECK-DAG: add x{{[0-9]+}}, [[ZREG]], #1 define i64 @bug34674(<2 x i64>* %p) {
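
Reviewer note (not part of the patch): the decision rule in isProfitableMergeUpdate reduces to the comparison sketched below. This is a minimal standalone illustration; the instruction names and the latency/micro-op figures are made-up placeholders, not values taken from the Exynos M1 or any other scheduling model.

#include <iostream>

struct InstrCost {
  unsigned Latency;  // latency in cycles, per the scheduling model
  unsigned MicroOps; // number of micro-ops issued
};

// The merged pre/post-indexed instruction replaces a plain load/store plus a
// separate base-register update (e.g. an ADD): it must be strictly faster than
// the two combined, or just as fast while issuing fewer micro-ops.
static bool isProfitable(InstrCost New, InstrCost OldA, InstrCost OldB) {
  unsigned OldLat = OldA.Latency + OldB.Latency;
  unsigned OldUops = OldA.MicroOps + OldB.MicroOps;
  if (New.Latency < OldLat)
    return true;                   // strictly faster: always a win
  if (New.Latency == OldLat)
    return New.MicroOps < OldUops; // tie on latency: decide on micro-ops
  return false;                    // slower: keep the separate instructions
}

int main() {
  // Hypothetical costs only: a pre-indexed 128-bit store that cracks into
  // three micro-ops is not merged, because STR Q + ADD is just as fast while
  // issuing fewer micro-ops, which is the behavior the EXYNOS checks expect.
  InstrCost PreIdxStrQ{2, 3}; // STR Qt, [Xn, #imm]!  (merged form)
  InstrCost StrQ{1, 1};       // STR Qt, [Xn, #imm]
  InstrCost AddImm{1, 1};     // ADD Xn, Xn, #imm
  std::cout << std::boolalpha << isProfitable(PreIdxStrQ, StrQ, AddImm)
            << '\n'; // prints "false"
}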