Index: lib/Target/AArch64/AArch64.h
===================================================================
--- lib/Target/AArch64/AArch64.h
+++ lib/Target/AArch64/AArch64.h
@@ -46,6 +46,8 @@
 
 FunctionPass *createAArch64CollectLOHPass();
 
+FunctionPass *createAArch64SSALoadStoreOptPass();
+
 void initializeAArch64ExpandPseudoPass(PassRegistry&);
 } // end namespace llvm
 
Index: lib/Target/AArch64/AArch64SSALoadStoreOptimizer.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64SSALoadStoreOptimizer.cpp
@@ -0,0 +1,490 @@
+//===- AArch64SSALoadStoreOptimizer.cpp - AArch64 SSA load/store opt. ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations in SSA form. This pass should be run before register
+// allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/IR/Constants.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ssa-ldst-opt"
+
+namespace {
+class AArch64SSALoadStoreOpt : public MachineFunctionPass {
+
+  const AArch64InstrInfo *TII;
+  MachineRegisterInfo *MRI;
+  const TargetRegisterInfo *TRI;
+  const AArch64Subtarget *Subtarget;
+
+public:
+  static char ID;
+  AArch64SSALoadStoreOpt() : MachineFunctionPass(ID) {}
+  bool tryToWidenLdStInst(MachineBasicBlock::iterator &I,
+                          std::set<MachineInstr *> &stores,
+                          std::set<MachineInstr *> &loads);
+  bool checkOffsetRange(MachineInstr *MI);
+  bool isConsecutive(MachineInstr *FirstMI, MachineInstr *SecondMI,
+                     std::vector<MachineInstr *> &MIs);
+  void buildWideLdStInst(MachineInstr *MI, MachineInstr *MergeMI);
+  bool optimizeBlock(MachineBasicBlock &MBB);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  const char *getPassName() const override {
+    return "AArch64 SSA load / store optimization pass";
+  }
+};
+char AArch64SSALoadStoreOpt::ID = 0;
+} // namespace
+
+// Set I to the position of MI within MBB.
+static void getPosition(MachineBasicBlock::iterator &I, MachineBasicBlock &MBB,
+                        MachineInstr &MI) {
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;
+       ++MBBI) {
+    if (&*MBBI == &MI)
+      I = MBBI;
+  }
+}
+
+// Return the memory access size in bytes, which is also the scaling factor
+// used to convert between scaled and unscaled offsets.
+static int getMemScale(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Opcode has unknown scale!");
+  case AArch64::LDRBBui:
+  case AArch64::LDURBBi:
+  case AArch64::LDRSBWui:
+  case AArch64::LDURSBWi:
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+    return 1;
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+  case AArch64::LDRSHWui:
+  case AArch64::LDURSHWi:
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+    return 2;
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPWi:
+  case AArch64::STPSi:
+  case AArch64::STPWi:
+    return 4;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDPDi:
+  case AArch64::LDPXi:
+  case AArch64::STPDi:
+  case AArch64::STPXi:
+    return 8;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+    return 16;
+  }
+}
+
+static const MachineOperand &getLdStRegOp(const MachineInstr *MI) {
+  return MI->getOperand(0);
+}
+
+static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
+  return MI->getOperand(1);
+}
+
+static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
+  return MI->getOperand(2);
+}
+
+bool AArch64SSALoadStoreOpt::checkOffsetRange(MachineInstr *MI) {
+  bool IsScaled = !TII->isUnscaledLdSt(MI);
+  int Offset = getLdStOffsetOp(MI).getImm();
+
+  // When the original load/store is scaled (ldr/str) and has an odd offset
+  // value, it can be widened only if -256 <= unscaled offset value < 256,
+  // where unscaled offset value = scaled offset value * memory scale size.
+  if (IsScaled && (Offset % 2)) {
+    int UnscaledOffset = Offset * getMemScale(MI);
+    return UnscaledOffset < 256 && UnscaledOffset >= -256;
+  }
+
+  return true;
+}
+
+// Check if two loads/stores have consecutive memory accesses.
+bool AArch64SSALoadStoreOpt::isConsecutive(MachineInstr *FirstMI,
+                                           MachineInstr *SecondMI,
+                                           std::vector<MachineInstr *> &MIs) {
+  bool FirstMIIsUnscaled = TII->isUnscaledLdSt(FirstMI);
+  unsigned FirstMIBaseReg = getLdStBaseOp(FirstMI).getReg();
+  int FirstMIOffset = getLdStOffsetOp(FirstMI).getImm();
+  int FirstMIOffsetStride = FirstMIIsUnscaled ? getMemScale(FirstMI) : 1;
+
+  bool SecondMIIsUnscaled = TII->isUnscaledLdSt(SecondMI);
+  unsigned SecondMIBaseReg = getLdStBaseOp(SecondMI).getReg();
+  int SecondMIOffset = getLdStOffsetOp(SecondMI).getImm();
+  if (FirstMIIsUnscaled != SecondMIIsUnscaled) {
+    // We're trying to pack instructions that differ in how they are scaled.
+    // If FirstMI is scaled then scale the offset of SecondMI accordingly.
+    // Otherwise, do the opposite (i.e., make SecondMI's offset unscaled).
+    int MemSize = getMemScale(SecondMI);
+    if (SecondMIIsUnscaled) {
+      // If the unscaled offset isn't a multiple of the MemSize, we can't
+      // pack the operations together: bail and keep looking.
+      if (SecondMIOffset % MemSize)
+        return false;
+
+      SecondMIOffset /= MemSize;
+    } else {
+      SecondMIOffset *= MemSize;
+    }
+  }
+
+  if (FirstMIBaseReg == SecondMIBaseReg &&
+      ((FirstMIOffset == SecondMIOffset + FirstMIOffsetStride) ||
+       (FirstMIOffset + FirstMIOffsetStride == SecondMIOffset))) {
+    if (FirstMIOffset < SecondMIOffset) {
+      MIs.push_back(FirstMI);
+      MIs.push_back(SecondMI);
+    } else {
+      MIs.push_back(SecondMI);
+      MIs.push_back(FirstMI);
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+// Build a wide load/store instruction. MI has the lower offset; MergeMI has
+// the higher offset.
+void AArch64SSALoadStoreOpt::buildWideLdStInst(MachineInstr *MI,
+                                               MachineInstr *MergeMI) {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineOperand &RegOp = MI->getOperand(0);
+  unsigned Reg = MI->getOperand(0).getReg();
+  const MachineOperand &BaseRegOp = MI->getOperand(1);
+  bool IsScaled = !TII->isUnscaledLdSt(MI->getOpcode());
+  int OffsetImm = MI->getOperand(2).getImm();
+
+  bool ShouldBeUnscaled = false;
+  // When the original load/store is scaled (ldr/str), the offset must be
+  // unscaled if it is odd. Otherwise, the offset is halved.
+  if (IsScaled) {
+    if (OffsetImm % 2) {
+      OffsetImm *= getMemScale(MI);
+      ShouldBeUnscaled = true;
+    } else {
+      OffsetImm /= 2;
+    }
+  }
+
+  MachineBasicBlock::iterator InsertionPoint;
+  getPosition(InsertionPoint, *MBB, *MI);
+
+  // Select the opcode for the wide load/store, taking into account whether it
+  // should be scaled or unscaled.
+  unsigned NewOpc = 0;
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode.");
+  case AArch64::LDRWui:
+    if (ShouldBeUnscaled)
+      NewOpc = AArch64::LDURXi;
+    else
+      NewOpc = AArch64::LDRXui;
+    break;
+  case AArch64::LDURWi:
+    NewOpc = AArch64::LDURXi;
+    break;
+  case AArch64::STRWui:
+    if (ShouldBeUnscaled)
+      NewOpc = AArch64::STURXi;
+    else
+      NewOpc = AArch64::STRXui;
+    break;
+  case AArch64::STURWi:
+    NewOpc = AArch64::STURXi;
+    break;
+  }
+
+  // Set the register operand (dst of load or src of store) to the wide
+  // register class.
+  MRI->setRegClass(Reg, &AArch64::GPR64RegClass);
+
+  // Build the wide load/store instruction.
+  MachineInstr *NewMI =
+      BuildMI(*MBB, InsertionPoint, MI->getDebugLoc(), TII->get(NewOpc))
+          .addOperand(RegOp)
+          .addOperand(BaseRegOp)
+          .addImm(OffsetImm)
+          .setMemRefs(MI->mergeMemRefsWith(*MergeMI));
+  (void)NewMI;
+
+  DEBUG(dbgs() << "Creating the wide load/store instruction. Replacing "
+                  "instructions:\n    ");
+  DEBUG(MI->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(MergeMI->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG((NewMI)->print(dbgs()));
+}
+
+bool AArch64SSALoadStoreOpt::tryToWidenLdStInst(
+    MachineBasicBlock::iterator &I, std::set<MachineInstr *> &stores,
+    std::set<MachineInstr *> &loads) {
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator MBBI = I;
+  MachineInstr *FirstMI = I;
+  ++MBBI;
+
+  // FirstMI (store) should have a register base operand and an immediate
+  // offset operand.
+  if (!getLdStBaseOp(FirstMI).isReg() || !getLdStOffsetOp(FirstMI).isImm())
+    return false;
+
+  unsigned Reg = getLdStRegOp(FirstMI).getReg();
+
+  // If FirstMI's offset is out of range, give up widening.
+  if (!checkOffsetRange(FirstMI))
+    return false;
+
+  // The register operand of FirstMI should have a single definition.
+  if (!MRI->hasOneDef(Reg))
+    return false;
+
+  MachineInstr &DefInst = *MRI->def_instr_begin(Reg);
+
+  // DefInst should be a load instruction.
+  if (DefInst.getOpcode() != AArch64::LDRWui &&
+      DefInst.getOpcode() != AArch64::LDURWi)
+    return false;
+
+  // DefInst (load) should have a register base operand and an immediate
+  // offset operand.
+  if (!getLdStBaseOp(&DefInst).isReg() || !getLdStOffsetOp(&DefInst).isImm())
+    return false;
+
+  // DefInst should have a unique use instruction, which should be FirstMI.
+  if (!MRI->hasOneUse(Reg))
+    return false;
+
+  std::vector<MachineInstr *> consecutive_stores;
+  std::vector<MachineInstr *> consecutive_loads;
+
+  // Find a store that is consecutive with FirstMI.
+  for (; MBBI != E; ++MBBI) {
+    MachineInstr *MI = MBBI;
+
+    if (MI->getOpcode() != AArch64::STURWi &&
+        MI->getOpcode() != AArch64::STRWui)
+      continue;
+
+    // MI (store) should have a register base operand and an immediate offset
+    // operand.
+    if (!getLdStBaseOp(MI).isReg() || !getLdStOffsetOp(MI).isImm())
+      return false;
+
+    // Check if FirstMI (store) and MI (store) are consecutive.
+    if (!isConsecutive(FirstMI, MI, consecutive_stores))
+      continue;
+
+    unsigned MIReg = getLdStRegOp(MI).getReg();
+
+    // The register operand of MI should have a single definition.
+    if (!MRI->hasOneDef(MIReg)) {
+      consecutive_stores.clear();
+      continue;
+    }
+
+    MachineInstr &MIDefInst = *MRI->def_instr_begin(MIReg);
+
+    // MIDefInst should have a unique use instruction, which should be MI.
+    if (!MRI->hasOneUse(MIReg))
+      return false;
+
+    // MIDefInst should be a load instruction.
+    if (MIDefInst.getOpcode() != AArch64::LDRWui &&
+        MIDefInst.getOpcode() != AArch64::LDURWi) {
+      consecutive_stores.clear();
+      continue;
+    }
+
+    // MIDefInst (load) should have a register base operand and an immediate
+    // offset operand.
+    if (!getLdStBaseOp(&MIDefInst).isReg() ||
+        !getLdStOffsetOp(&MIDefInst).isImm())
+      return false;
+
+    // If MIDefInst's offset is out of range, give up widening.
+    if (!checkOffsetRange(&MIDefInst))
+      return false;
+
+    // Check if DefInst (load) and MIDefInst (load) are consecutive.
+    if (isConsecutive(&DefInst, &MIDefInst, consecutive_loads)) {
+      // When consecutive loads/stores are found, the following must hold:
+      //   dst reg of first load  == src reg of first store
+      //   dst reg of second load == src reg of second store
+      if (getLdStRegOp(consecutive_loads[0]).getReg() !=
+              getLdStRegOp(consecutive_stores[0]).getReg() ||
+          getLdStRegOp(consecutive_loads[1]).getReg() !=
+              getLdStRegOp(consecutive_stores[1]).getReg())
+        return false;
+
+      for (auto &I : consecutive_stores) {
+        stores.insert(I);
+      }
+
+      for (auto &I : consecutive_loads) {
+        loads.insert(I);
+      }
+
+      // Build wide load/store instructions from the consecutive loads/stores.
+      buildWideLdStInst(consecutive_loads[0], consecutive_loads[1]);
+      buildWideLdStInst(consecutive_stores[0], consecutive_stores[1]);
+      return true;
+    }
+
+    consecutive_stores.clear();
+  }
+  return false;
+}
+
+bool AArch64SSALoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+  bool Modified = false;
+  // Find two consecutive 32-bit loads and two consecutive 32-bit stores that
+  // write the values of those loads, and transform them into a 64-bit load
+  // and a 64-bit store.
+  //
+  // When the wide load/store is unscaled (ldur/stur), the offset does not
+  // need to change.
+  // e.g.,
+  //   %vreg2 = LDURWi %vreg0, -76;
+  //   %vreg3 = LDURWi %vreg0, -72;
+  //   STURWi %vreg2, %vreg1, -44;
+  //   STURWi %vreg3, %vreg1, -40;
+  //   ; becomes
+  //   %vreg2 = LDURXi %vreg0, -76;
+  //   STURXi %vreg2, %vreg1, -44;
+  //
+  // When the wide load/store is scaled (ldr/str), the offset should be half
+  // of the original value.
+  // e.g.,
+  //   %vreg2 = LDRWui %vreg0, 4;
+  //   %vreg3 = LDRWui %vreg0, 5;
+  //   STRWui %vreg2, %vreg1, 2;
+  //   STRWui %vreg3, %vreg1, 3;
+  //   ; becomes
+  //   %vreg2 = LDRXui %vreg0, 2;
+  //   STRXui %vreg2, %vreg1, 1;
+  //
+  // When the original load/store is scaled (ldr/str) and has an odd offset
+  // value, it can be widened only if -256 <= unscaled offset value < 256,
+  // where unscaled offset value = scaled offset value * memory scale size.
+  // e.g.,
+  //   %vreg2 = LDRWui %vreg0, 13;
+  //   %vreg3 = LDRWui %vreg0, 14;
+  //   STRWui %vreg2, %vreg1, 37;
+  //   STRWui %vreg3, %vreg1, 38;
+  //   ; becomes
+  //   %vreg2 = LDURXi %vreg0, 52;    ; 52 = 13 * 4
+  //   STURXi %vreg2, %vreg1, 148;    ; 148 = 37 * 4
+  std::set<MachineInstr *> loads;
+  std::set<MachineInstr *> stores;
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;
+       ++MBBI) {
+    MachineInstr *MI = MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      break;
+    case AArch64::STURWi:
+    case AArch64::STRWui:
+      if (stores.find(MI) != stores.end())
+        continue;
+
+      if (tryToWidenLdStInst(MBBI, stores, loads)) {
+        Modified = true;
+      }
+      break;
+    }
+  }
+
+  // Erase the narrow loads/stores that have been replaced by wide ones.
+  for (auto &I : loads) {
+    I->eraseFromParent();
+  }
+
+  for (auto &I : stores) {
+    I->eraseFromParent();
+  }
+
+  return Modified;
+}
+
+bool AArch64SSALoadStoreOpt::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
+  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+  TRI = MF.getSubtarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF)
+    Changed |= optimizeBlock(MBB);
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64SSALoadStoreOptPass() {
+  return new AArch64SSALoadStoreOpt();
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -107,6 +107,12 @@
                            cl::desc("Enable the loop data prefetch pass"),
                            cl::init(true));
 
+static cl::opt<bool>
+    EnableSSALoadStoreOpt("aarch64-ssa-load-store-opt",
+                          cl::desc("Enable the load/store widening"
+                                   " optimization pass in SSA form"),
+                          cl::init(true), cl::Hidden);
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -346,6 +352,8 @@
 #endif
 
 bool AArch64PassConfig::addILPOpts() {
+  if (EnableSSALoadStoreOpt)
+    addPass(createAArch64SSALoadStoreOptPass());
   if (EnableCondOpt)
     addPass(createAArch64ConditionOptimizerPass());
   if (EnableCCMP)
Index: lib/Target/AArch64/CMakeLists.txt
===================================================================
--- lib/Target/AArch64/CMakeLists.txt
+++ lib/Target/AArch64/CMakeLists.txt
@@ -45,6 +45,7 @@
   AArch64FrameLowering.cpp
   AArch64ConditionOptimizer.cpp
   AArch64RedundantCopyElimination.cpp
+  AArch64SSALoadStoreOptimizer.cpp
   AArch64ISelDAGToDAG.cpp
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
Index: test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- test/CodeGen/AArch64/ldst-opt.ll
+++ test/CodeGen/AArch64/ldst-opt.ll
@@ -1122,9 +1122,9 @@
   %phi1 = phi i32* [ %gep4, %for.body ], [ %b, %0 ]
   %phi2 = phi i32* [ %gep3, %for.body ], [ %a, %0 ]
   %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
-  %gep1 = getelementptr i32, i32* %phi1, i64 -1
+  %gep1 = getelementptr i32, i32* %phi1, i64 -3
   %load1 = load i32, i32* %gep1
-  %gep2 = getelementptr i32, i32* %phi2, i64 -1
+  %gep2 = getelementptr i32, i32* %phi2, i64 -3
   store i32 %load1, i32* %gep2
   %load2 = load i32, i32* %phi1
   store i32 %load2, i32* %phi2
Index: test/CodeGen/AArch64/ssa-ldst-opt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/ssa-ldst-opt.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=aarch64 -verify-machineinstrs -asm-verbose=false -o - %s | FileCheck %s
+
+; CHECK-LABEL: test_offset_no_changed:
+; CHECK: ldur x8, [x0, #-76]
+; CHECK-NEXT: stur x8, [x1, #-44]
+; CHECK-NEXT: ret
+define void @test_offset_no_changed(i32* %p1, i32* %p2) {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 -19
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 -11
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 -18
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 -10
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_offset_halved:
+; CHECK: ldr x8, [x0, #16]
+; CHECK-NEXT: str x8, [x1, #8]
+; CHECK-NEXT: ret
+define void @test_offset_halved(i32* %p1, i32* %p2) {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 4
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 2
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 5
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 3
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_offset_unscaled:
+; CHECK: ldur x8, [x0, #52]
+; CHECK-NEXT: stur x8, [x1, #148]
+; CHECK-NEXT: ret
+define void @test_offset_unscaled(i32* %p1, i32* %p2) {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 13
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 37
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 14
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 38
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+