Index: lib/Target/AArch64/AArch64.h
===================================================================
--- lib/Target/AArch64/AArch64.h
+++ lib/Target/AArch64/AArch64.h
@@ -46,6 +46,8 @@
 FunctionPass *createAArch64CollectLOHPass();
 
+FunctionPass *createAArch64SSALoadStoreOptPass();
+
 void initializeAArch64ExpandPseudoPass(PassRegistry&);
 
 } // end namespace llvm
Index: lib/Target/AArch64/AArch64SSALoadStoreOptimizer.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64SSALoadStoreOptimizer.cpp
@@ -0,0 +1,535 @@
+//===- AArch64SSALoadStoreOptimizer.cpp - AArch64 SSA load/store opt -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations in SSA form. This pass should be run before register
+// allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <set>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ssa-ldst-opt"
+
+#define MAX_UNSCALED_OFFSET 255
+#define MIN_UNSCALED_OFFSET -256
+
+namespace {
+class AArch64SSALoadStoreOpt : public MachineFunctionPass {
+
+  const AArch64InstrInfo *TII;
+  MachineRegisterInfo *MRI;
+  const AArch64Subtarget *Subtarget;
+  AliasAnalysis *AA;
+
+public:
+  static char ID;
+  AArch64SSALoadStoreOpt() : MachineFunctionPass(ID) {}
+
+  bool tryToWidenLdStInst(MachineInstr *MI,
+                          std::set<MachineInstr *> &StoresWillBeDeleted,
+                          std::set<MachineInstr *> &LoadsWillBeDeleted);
+  bool instrAliased(MachineInstr *MIa, MachineInstr *MIb);
+  bool checkOffsetRange(MachineInstr *MI);
+  bool isConsecutive(MachineInstr *FirstMI, MachineInstr *SecondMI,
+                     SmallVector<MachineInstr *, 2> &MIs);
+  bool hasAliasBetween(MachineInstr *MIa, MachineInstr *MIb);
+  void buildWideLdStInst(MachineInstr *MI, MachineInstr *MergeMI);
+  bool optimizeBlock(MachineBasicBlock &MBB);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "AArch64 SSA load / store optimization pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+char AArch64SSALoadStoreOpt::ID = 0;
+} // namespace
+
+// Scaling factor for unscaled load or store.
+static int getMemScale(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Opcode has unknown scale!");
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+    return 4;
+  }
+}
+
+static const MachineOperand &getLdStRegOp(const MachineInstr *MI) {
+  return MI->getOperand(0);
+}
+
+static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
+  return MI->getOperand(1);
+}
+
+static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
+  return MI->getOperand(2);
+}
+
+// Check if the memory operands of two instructions alias each other.
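+// Conservatively returns true (may alias) when either instruction lacks a
+// single memory operand or an underlying IR value, since in that case alias
+// analysis has nothing reliable to query.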
+bool AArch64SSALoadStoreOpt::instrAliased(MachineInstr *MIa,
+                                          MachineInstr *MIb) {
+  MachineMemOperand *MMOA =
+      MIa->hasOneMemOperand() ? *MIa->memoperands_begin() : nullptr;
+  MachineMemOperand *MMOB =
+      MIb->hasOneMemOperand() ? *MIb->memoperands_begin() : nullptr;
+
+  if (!MMOA || !MMOB)
+    return true;
+
+  if (!MMOA->getValue() || !MMOB->getValue())
+    return true;
+
+  MemoryLocation LocA(MMOA->getValue(), MMOA->getSize(), MMOA->getAAInfo());
+  MemoryLocation LocB(MMOB->getValue(), MMOB->getSize(), MMOB->getAAInfo());
+
+  return AA->alias(LocA, LocB);
+}
+
+// Check if any instruction between MIa and MIb may alias with whichever of
+// the two appears later in the block.
+bool AArch64SSALoadStoreOpt::hasAliasBetween(MachineInstr *MIa,
+                                             MachineInstr *MIb) {
+  MachineBasicBlock *MBB = MIa->getParent();
+
+  MachineInstr *EndMI = MIb;
+  unsigned Status = 0;
+  for (auto &MBBI : *MBB) {
+    MachineInstr *MI = &MBBI;
+    switch (Status) {
+    default:
+      llvm_unreachable("Unexpected status");
+    case 0: // Neither MIa nor MIb has been seen yet.
+      if (MI == MIa) {
+        Status++;
+      } else if (MI == MIb) {
+        EndMI = MIa;
+        Status++;
+      }
+      break;
+    case 1: // One of MIa and MIb has been seen.
+      if (MI == EndMI) {
+        Status++;
+      } else if (MI->mayStore()) {
+        // Check if MI is aliased with EndMI.
+        if (instrAliased(MI, EndMI))
+          return true;
+      }
+      break;
+    case 2: // Both MIa and MIb have been seen.
+      break;
+    }
+  }
+
+  return false;
+}
+
+bool AArch64SSALoadStoreOpt::checkOffsetRange(MachineInstr *MI) {
+  bool IsScaled = !TII->isUnscaledLdSt(MI);
+  int Offset = getLdStOffsetOp(MI).getImm();
+
+  // When the original load/store is scaled (ldr/str) and has an odd offset
+  // value, it can be widened only if
+  //   MIN_UNSCALED_OFFSET <= unscaled offset value <= MAX_UNSCALED_OFFSET
+  // is satisfied, where
+  //   unscaled offset value = scaled offset value * memory scale size.
+  if (IsScaled) {
+    int UnscaledOffset = Offset * getMemScale(MI);
+    if (Offset % 2)
+      return UnscaledOffset <= MAX_UNSCALED_OFFSET &&
+             UnscaledOffset >= MIN_UNSCALED_OFFSET;
+  }
+
+  return true;
+}
+
+// Check if two loads/stores have consecutive memory accesses. If they do,
+// MIs holds the two instructions in memory-address order.
+bool AArch64SSALoadStoreOpt::isConsecutive(
+    MachineInstr *FirstMI, MachineInstr *SecondMI,
+    SmallVector<MachineInstr *, 2> &MIs) {
+  assert((FirstMI->mayLoad() ? SecondMI->mayLoad()
+                             : (FirstMI->mayStore() && SecondMI->mayStore())) &&
+         "Unexpected input instructions");
+  bool FirstMIIsUnscaled = TII->isUnscaledLdSt(FirstMI);
+  unsigned FirstMIBaseReg = getLdStBaseOp(FirstMI).getReg();
+  int FirstMIOffset = getLdStOffsetOp(FirstMI).getImm();
+  int FirstMIOffsetStride = FirstMIIsUnscaled ? getMemScale(FirstMI) : 1;
+
+  bool SecondMIIsUnscaled = TII->isUnscaledLdSt(SecondMI);
+  unsigned SecondMIBaseReg = getLdStBaseOp(SecondMI).getReg();
+  int SecondMIOffset = getLdStOffsetOp(SecondMI).getImm();
+  if (FirstMIIsUnscaled != SecondMIIsUnscaled) {
+    // We're trying to pack instructions that differ in how they are scaled.
+    // If FirstMI is scaled then scale the offset of SecondMI accordingly.
+    // Otherwise, do the opposite (i.e., make SecondMI's offset unscaled).
+    int MemSize = getMemScale(SecondMI);
+    if (SecondMIIsUnscaled) {
+      // If the unscaled offset isn't a multiple of the MemSize, we can't
+      // pack the operations together: bail and keep looking.
+      if (SecondMIOffset % MemSize)
+        return false;
+
+      SecondMIOffset /= MemSize;
+    } else {
+      SecondMIOffset *= MemSize;
+    }
+  }
+
+  if (FirstMIBaseReg == SecondMIBaseReg) {
+    if (FirstMIOffset + FirstMIOffsetStride == SecondMIOffset) {
+      MIs.push_back(FirstMI);
+      MIs.push_back(SecondMI);
+      return true;
+    } else if (FirstMIOffset == SecondMIOffset + FirstMIOffsetStride) {
+      MIs.push_back(SecondMI);
+      MIs.push_back(FirstMI);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Build a wide load/store instruction. MI has the lower offset; MergeMI has
+// the higher offset.
+void AArch64SSALoadStoreOpt::buildWideLdStInst(MachineInstr *MI,
+                                               MachineInstr *MergeMI) {
+  assert((MI->mayLoad() ? MergeMI->mayLoad()
+                        : (MI->mayStore() && MergeMI->mayStore())) &&
+         "Unexpected input instructions");
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineOperand &RegOp = MI->getOperand(0);
+  unsigned Reg = RegOp.getReg();
+  const MachineOperand &BaseRegOp = MI->getOperand(1);
+  bool IsScaled = !TII->isUnscaledLdSt(MI->getOpcode());
+  int OffsetImm = MI->getOperand(2).getImm();
+
+  bool ShouldBeUnscaled = false;
+  // When the original load/store is scaled (ldr/str), the offset must be
+  // unscaled if it is an odd value. Otherwise, the offset should be halved.
+  if (IsScaled) {
+    if (OffsetImm % 2) {
+      OffsetImm *= getMemScale(MI);
+      ShouldBeUnscaled = true;
+    } else {
+      OffsetImm /= 2;
+    }
+  }
+
+  // Select the opcode for the wide load/store, taking into account whether it
+  // should be scaled or unscaled.
+  unsigned NewOpc = 0;
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected MI's opcode.");
+    break;
+  case AArch64::LDRWui:
+    NewOpc = ShouldBeUnscaled ? AArch64::LDURXi : AArch64::LDRXui;
+    break;
+  case AArch64::LDURWi:
+    NewOpc = AArch64::LDURXi;
+    break;
+  case AArch64::STRWui:
+    NewOpc = ShouldBeUnscaled ? AArch64::STURXi : AArch64::STRXui;
+    break;
+  case AArch64::STURWi:
+    NewOpc = AArch64::STURXi;
+    break;
+  }
+
+  // Set the register operand (dst of load or src of store) to wide type.
+  MRI->setRegClass(Reg, &AArch64::GPR64RegClass);
+
+  // Generate the wide memory operand.
+  MachineMemOperand *MMO = *MI->memoperands_begin();
+  MachineMemOperand *NewMMO = new MachineMemOperand(
+      MMO->getPointerInfo(), MMO->getFlags(), MMO->getSize() << 1,
+      MMO->getBaseAlignment() << 1, MMO->getAAInfo(), MMO->getRanges());
+
+  // Set the location for inserting the new wide instruction.
+  MachineBasicBlock::iterator InsertionPoint = MI;
+
+  // Build the wide load/store instruction.
+  MachineInstr *NewMI =
+      BuildMI(*MBB, InsertionPoint, MI->getDebugLoc(), TII->get(NewOpc))
+          .addOperand(RegOp)
+          .addOperand(BaseRegOp)
+          .addImm(OffsetImm)
+          .setMemRefs(&NewMMO, &NewMMO);
+  NewMI->addMemOperand(*MBB->getParent(), NewMMO);
+  (void)NewMI;
+
+  DEBUG(dbgs() << "Creating the wide load/store instruction. Replacing "
+                  "instructions:\n    ");
+  DEBUG(MI->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(MergeMI->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG(NewMI->print(dbgs()));
+}
+
+bool AArch64SSALoadStoreOpt::tryToWidenLdStInst(
+    MachineInstr *MI, std::set<MachineInstr *> &StoresWillBeDeleted,
+    std::set<MachineInstr *> &LoadsWillBeDeleted) {
+  MachineInstr *FirstMI = MI;
+
+  // FirstMI (a store) should have a register base operand and an immediate
+  // offset operand.
+  if (!getLdStBaseOp(FirstMI).isReg() || !getLdStOffsetOp(FirstMI).isImm())
+    return false;
+
+  unsigned Reg = getLdStRegOp(FirstMI).getReg();
+
+  // Check if there exists a def instruction for the register operand of
+  // FirstMI.
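+  // (In machine SSA form a virtual register has at most one definition, so
+  // hasOneDef here mainly rejects registers with no defining instruction.)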
+  if (!MRI->hasOneDef(Reg))
+    return false;
+
+  MachineInstr &DefInst = *MRI->def_instr_begin(Reg);
+
+  // DefInst should be a load instruction.
+  if (DefInst.getOpcode() != AArch64::LDRWui &&
+      DefInst.getOpcode() != AArch64::LDURWi)
+    return false;
+
+  // DefInst (a load) should have a register base operand and an immediate
+  // offset operand.
+  if (!getLdStBaseOp(&DefInst).isReg() || !getLdStOffsetOp(&DefInst).isImm())
+    return false;
+
+  // DefInst should have a unique use instruction, which should be FirstMI.
+  if (!MRI->hasOneUse(Reg))
+    return false;
+
+  SmallVector<MachineInstr *, 2> ConsecutiveStores;
+  SmallVector<MachineInstr *, 2> ConsecutiveLoads;
+
+  // Find a store consecutive with FirstMI.
+  MachineBasicBlock::iterator E = MI->getParent()->end();
+  MachineBasicBlock::iterator MBBI = MI;
+  ++MBBI;
+  for (; MBBI != E; ++MBBI) {
+    MachineInstr *MI = MBBI;
+
+    if (MI->getOpcode() != AArch64::STURWi &&
+        MI->getOpcode() != AArch64::STRWui)
+      continue;
+
+    // MI (a store) should have a register base operand and an immediate
+    // offset operand.
+    if (!getLdStBaseOp(MI).isReg() || !getLdStOffsetOp(MI).isImm())
+      return false;
+
+    // Check if FirstMI (store) and MI (store) are consecutive.
+    if (!isConsecutive(FirstMI, MI, ConsecutiveStores))
+      continue;
+
+    // If the first store's offset is out of range, give up widening.
+    if (!checkOffsetRange(ConsecutiveStores[0]))
+      return false;
+
+    unsigned MIReg = getLdStRegOp(MI).getReg();
+
+    // Check if there exists a def instruction for the register operand of MI.
+    if (!MRI->hasOneDef(MIReg)) {
+      ConsecutiveStores.clear();
+      continue;
+    }
+
+    MachineInstr &MIDefInst = *MRI->def_instr_begin(MIReg);
+
+    // MIDefInst should have a unique use instruction, which should be MI.
+    if (!MRI->hasOneUse(MIReg))
+      return false;
+
+    // MIDefInst should be a load instruction.
+    if (MIDefInst.getOpcode() != AArch64::LDRWui &&
+        MIDefInst.getOpcode() != AArch64::LDURWi) {
+      ConsecutiveStores.clear();
+      continue;
+    }
+
+    // MIDefInst (a load) should have a register base operand and an immediate
+    // offset operand.
+    if (!getLdStBaseOp(&MIDefInst).isReg() ||
+        !getLdStOffsetOp(&MIDefInst).isImm())
+      return false;
+
+    // Check if DefInst (load) and MIDefInst (load) are consecutive.
+    if (isConsecutive(&DefInst, &MIDefInst, ConsecutiveLoads)) {
+      // If the first load's offset is out of range, give up widening.
+      if (!checkOffsetRange(ConsecutiveLoads[0]))
+        return false;
+
+      // If consecutive loads/stores are found, the following must hold:
+      //   dst reg of first load  == src reg of first store
+      //   dst reg of second load == src reg of second store
+      if (getLdStRegOp(ConsecutiveLoads[0]).getReg() !=
+              getLdStRegOp(ConsecutiveStores[0]).getReg() ||
+          getLdStRegOp(ConsecutiveLoads[1]).getReg() !=
+              getLdStRegOp(ConsecutiveStores[1]).getReg())
+        return false;
+
+      // The first store instruction should not alias the second load
+      // instruction.
+      if (instrAliased(ConsecutiveStores[0], ConsecutiveLoads[1])) {
+        DEBUG(dbgs() << "The first store instruction:\n    ");
+        DEBUG(ConsecutiveStores[0]->print(dbgs()));
+        DEBUG(dbgs()
+              << "might be aliased with the second load instruction:\n    ");
+        DEBUG(ConsecutiveLoads[1]->print(dbgs()));
+        return false;
+      }
+
+      // The second store instruction should not alias the first load
+      // instruction.
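+      // (The pass only merges when alias analysis can prove that the load
+      // pair and the store pair access disjoint memory; otherwise collapsing
+      // each pair into one access could change the values loaded or stored.)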
+      if (instrAliased(ConsecutiveStores[1], ConsecutiveLoads[0])) {
+        DEBUG(dbgs() << "The second store instruction:\n    ");
+        DEBUG(ConsecutiveStores[1]->print(dbgs()));
+        DEBUG(dbgs()
+              << "might be aliased with the first load instruction:\n    ");
+        DEBUG(ConsecutiveLoads[0]->print(dbgs()));
+        return false;
+      }
+
+      // Check if any instruction between the two consecutive loads may alias
+      // the loads. If one is found, give up widening.
+      if (hasAliasBetween(ConsecutiveLoads[0], ConsecutiveLoads[1]))
+        return false;
+
+      // Check if any instruction between the two consecutive stores may alias
+      // the stores. If one is found, give up widening.
+      if (hasAliasBetween(ConsecutiveStores[0], ConsecutiveStores[1]))
+        return false;
+
+      for (auto &I : ConsecutiveStores)
+        StoresWillBeDeleted.insert(I);
+
+      for (auto &I : ConsecutiveLoads)
+        LoadsWillBeDeleted.insert(I);
+
+      // Build wide load/store instructions from the consecutive loads/stores.
+      buildWideLdStInst(ConsecutiveLoads[0], ConsecutiveLoads[1]);
+      buildWideLdStInst(ConsecutiveStores[0], ConsecutiveStores[1]);
+      return true;
+    }
+
+    ConsecutiveStores.clear();
+  }
+  return false;
+}
+
+bool AArch64SSALoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+  bool Modified = false;
+  // Find two consecutive 32-bit loads and two consecutive 32-bit stores that
+  // store the values of those consecutive 32-bit loads, and transform the
+  // loads/stores into a 64-bit load/store.
+  //
+  // When the wide load/store is unscaled (ldur/stur), the offset does not
+  // need to change.
+  // e.g.,
+  //   %vreg2 = LDURWi %vreg0, -76;
+  //   %vreg3 = LDURWi %vreg0, -72;
+  //   STURWi %vreg2, %vreg1, -44;
+  //   STURWi %vreg3, %vreg1, -40;
+  //   ; becomes
+  //   %vreg2 = LDURXi %vreg0, -76;
+  //   STURXi %vreg2, %vreg1, -44;
+  //
+  // When the wide load/store is scaled (ldr/str), the offset should be half
+  // of the original value.
+  // e.g.,
+  //   %vreg2 = LDRWui %vreg0, 4;
+  //   %vreg3 = LDRWui %vreg0, 5;
+  //   STRWui %vreg2, %vreg1, 2;
+  //   STRWui %vreg3, %vreg1, 3;
+  //   ; becomes
+  //   %vreg2 = LDRXui %vreg0, 2;
+  //   STRXui %vreg2, %vreg1, 1;
+  //
+  // When the original load/store is scaled (ldr/str) and has an odd offset
+  // value, it can be widened only if
+  //   MIN_UNSCALED_OFFSET <= unscaled offset value <= MAX_UNSCALED_OFFSET
+  // is satisfied, where
+  //   unscaled offset value = scaled offset value * memory scale size.
+  // e.g.,
+  //   %vreg2 = LDRWui %vreg0, 13;
+  //   %vreg3 = LDRWui %vreg0, 14;
+  //   STRWui %vreg2, %vreg1, 37;
+  //   STRWui %vreg3, %vreg1, 38;
+  //   ; becomes
+  //   %vreg2 = LDURXi %vreg0, 52;   ; 52 = 13 * 4
+  //   STURXi %vreg2, %vreg1, 148;   ; 148 = 37 * 4
+  std::set<MachineInstr *> LoadsWillBeDeleted;
+  std::set<MachineInstr *> StoresWillBeDeleted;
+  for (auto &MBBI : MBB) {
+    MachineInstr *MI = &MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      break;
+    case AArch64::STURWi:
+    case AArch64::STRWui:
+      if (StoresWillBeDeleted.find(MI) != StoresWillBeDeleted.end())
+        continue;
+
+      if (tryToWidenLdStInst(MI, StoresWillBeDeleted, LoadsWillBeDeleted))
+        Modified = true;
+
+      break;
+    }
+  }
+
+  for (auto &I : LoadsWillBeDeleted)
+    I->eraseFromParent();
+
+  for (auto &I : StoresWillBeDeleted)
+    I->eraseFromParent();
+
+  return Modified;
+}
+
+bool AArch64SSALoadStoreOpt::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
+  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+  MRI = &MF.getRegInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF)
+    Changed |= optimizeBlock(MBB);
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64SSALoadStoreOptPass() {
+  return new AArch64SSALoadStoreOpt();
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -107,6 +107,12 @@
                             cl::desc("Enable the loop data prefetch pass"),
                             cl::init(true));
 
+static cl::opt<bool>
+    EnableSSALoadStoreOpt("aarch64-ssa-load-store-opt",
+                          cl::desc("Enable the load/store pair"
+                                   " optimization pass in SSA form"),
+                          cl::init(false), cl::Hidden);
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -346,6 +352,8 @@
 #endif
 
 bool AArch64PassConfig::addILPOpts() {
+  if (EnableSSALoadStoreOpt)
+    addPass(createAArch64SSALoadStoreOptPass());
   if (EnableCondOpt)
     addPass(createAArch64ConditionOptimizerPass());
   if (EnableCCMP)
Index: lib/Target/AArch64/CMakeLists.txt
===================================================================
--- lib/Target/AArch64/CMakeLists.txt
+++ lib/Target/AArch64/CMakeLists.txt
@@ -45,6 +45,7 @@
   AArch64FrameLowering.cpp
   AArch64ConditionOptimizer.cpp
   AArch64RedundantCopyElimination.cpp
+  AArch64SSALoadStoreOptimizer.cpp
   AArch64ISelDAGToDAG.cpp
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
Index: test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- test/CodeGen/AArch64/ldst-opt.ll
+++ test/CodeGen/AArch64/ldst-opt.ll
@@ -1122,9 +1122,9 @@
   %phi1 = phi i32* [ %gep4, %for.body ], [ %b, %0 ]
   %phi2 = phi i32* [ %gep3, %for.body ], [ %a, %0 ]
   %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
-  %gep1 = getelementptr i32, i32* %phi1, i64 -1
+  %gep1 = getelementptr i32, i32* %phi1, i64 -3
   %load1 = load i32, i32* %gep1
-  %gep2 = getelementptr i32, i32* %phi2, i64 -1
+  %gep2 = getelementptr i32, i32* %phi2, i64 -3
   store i32 %load1, i32* %gep2
   %load2 = load i32, i32* %phi1
   store i32 %load2, i32* %phi2
Index: test/CodeGen/AArch64/ssa-ldst-opt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/ssa-ldst-opt.ll
@@ -0,0 +1,182 @@
+; RUN: llc -march=aarch64 -verify-machineinstrs -asm-verbose=false -aarch64-ssa-load-store-opt -o - %s | FileCheck %s
+
+; This test is for the 'AArch64 SSA load/store optimization' pass: find two
+; consecutive 32-bit loads and two consecutive 32-bit stores that store the
+; loaded values, and transform them into a 64-bit load/store.
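+
+; For example, the first test below expects roughly this transformation
+; (a sketch; the actual register numbers are chosen by the compiler):
+;     ldur w8, [x0, #-76]          ldur x8, [x0, #-76]
+;     stur w8, [x1, #-44]    =>    stur x8, [x1, #-44]
+;     ldur w8, [x0, #-72]
+;     stur w8, [x1, #-40]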
+
+; CHECK-LABEL: test_offset_no_changed:
+; CHECK: ldur x8, [x0, #-76]
+; CHECK-NEXT: stur x8, [x1, #-44]
+; CHECK-NEXT: ret
+define void @test_offset_no_changed(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 -19
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !8
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 -11
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 -18
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !9
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 -10
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; CHECK-LABEL: test_offset_halved:
+; CHECK: ldr x8, [x0, #16]
+; CHECK-NEXT: str x8, [x1, #8]
+; CHECK-NEXT: ret
+define void @test_offset_halved(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 4
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !8
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 2
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 5
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !9
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 3
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; CHECK-LABEL: test_offset_unscaled:
+; CHECK: ldur x8, [x0, #52]
+; CHECK-NEXT: stur x8, [x1, #148]
+; CHECK-NEXT: ret
+define void @test_offset_unscaled(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 13
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !8
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 37
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 14
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !9
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 38
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; CHECK-LABEL: test_scaled_offset_range1:
+; CHECK: ldur x8, [x0, #252]
+; CHECK-NEXT: stur x8, [x1, #252]
+; CHECK-NEXT: ret
+define void @test_scaled_offset_range1(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 63
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !9
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 63
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 64
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !8
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 64
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; In the following test case, 'aarch64-ssa-load-store-opt' is applied.
+; Allowed offset range: 0 <= offset <= 16380 if (offset % 2 == 0).
+
+; CHECK-LABEL: test_scaled_offset_range2:
+; CHECK: ldr x8, [x0]
+; CHECK-NEXT: str x8, [x1, #16376]
+; CHECK-NEXT: ret
+define void @test_scaled_offset_range2(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 0
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !9
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 4094
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 1
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !8
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 4095
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; In the following test case, 'aarch64-ssa-load-store-opt' is not applied
+; since the first load offset is out of range.
+; Allowed offset range: -256 <= offset < 256 if (offset % 2 == 1).
+
+; CHECK-LABEL: test_scaled_offset_range3:
+; CHECK: ldr w8, [x0, #260]
+; CHECK-NEXT: str w8, [x1, #252]
+; CHECK-NEXT: ldr w8, [x0, #264]
+; CHECK-NEXT: str w8, [x1, #256]
+; CHECK-NEXT: ret
+define void @test_scaled_offset_range3(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 65
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !9
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 63
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 66
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !8
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 64
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; In the following three test cases, 'aarch64-ssa-load-store-opt' is not
+; applied since disjoint memory accesses cannot be guaranteed without alias
+; information.
+
+; CHECK-LABEL: test_offset_no_changed_no_tbaa_info:
+; CHECK: ldur w8, [x0, #-76]
+; CHECK-NEXT: stur w8, [x1, #-44]
+; CHECK-NEXT: ldur w8, [x0, #-72]
+; CHECK-NEXT: stur w8, [x1, #-40]
+; CHECK-NEXT: ret
+define void @test_offset_no_changed_no_tbaa_info(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 -19
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 -11
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 -18
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 -10
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_offset_halved_no_tbaa_info:
+; CHECK: ldr w8, [x0, #16]
+; CHECK-NEXT: str w8, [x1, #8]
+; CHECK-NEXT: ldr w8, [x0, #20]
+; CHECK-NEXT: str w8, [x1, #12]
+; CHECK-NEXT: ret
+define void @test_offset_halved_no_tbaa_info(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 4
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 2
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 5
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 3
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_offset_unscaled_no_tbaa_info:
+; CHECK: ldr w8, [x0, #52]
+; CHECK-NEXT: str w8, [x1, #148]
+; CHECK-NEXT: ldr w8, [x0, #56]
+; CHECK-NEXT: str w8, [x1, #152]
+; CHECK-NEXT: ret
+define void @test_offset_unscaled_no_tbaa_info(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 13
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 37
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 14
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 38
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.9.0 "}
+!1 = !{!"int", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C++ TBAA"}
+!4 = !{!"structA", !1, i64 0, !1, i64 4}
+!5 = !{!4, !1, i64 0}
+!6 = !{!4, !1, i64 4}
+!7 = !{!"structB", !1, i64 0, !1, i64 4}
+!8 = !{!7, !1, i64 0}
+!9 = !{!7, !1, i64 4}