Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -395,7 +395,9 @@
 //
 // SeparateConstOffsetFromGEP - Split GEPs for better CSE
 //
-FunctionPass *createSeparateConstOffsetFromGEPPass();
+FunctionPass *
+createSeparateConstOffsetFromGEPPass(const TargetMachine *TM = nullptr,
+                                     bool SimplifyGEP = false);

 //===----------------------------------------------------------------------===//
 //
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -115,7 +115,7 @@
   bool isCortexA57() const { return CPUString == "cortex-a57"; }
   bool isCortexA53() const { return CPUString == "cortex-a53"; }

-  bool useAA() const override { return isCortexA53() || isCortexA57(); }
+  bool useAA() const override { return isCortexA53(); }

   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -80,6 +80,11 @@
                 cl::desc("Work around Cortex-A53 erratum 835769"), cl::init(false));

+static cl::opt<bool>
+EnableGEPOpt("aarch64-gep-opt", cl::Hidden,
+             cl::desc("Enable optimizations on GEPs for better address CSE"),
+             cl::init(true));
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -193,6 +198,17 @@
   addPass(createCFGSimplificationPass());

   TargetPassConfig::addIRPasses();
+
+  if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
+    // Call the SeparateConstOffsetFromGEP pass to extract constants from GEP
+    // indices and simplify complex GEPs.
+    addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+    // Call the EarlyCSE pass to find and remove common subexpressions among
+    // the GEPs.
+    addPass(createEarlyCSEPass());
+    // Do loop invariant code motion in case any of the address calculations
+    // are actually loop invariant.
+    addPass(createLICMPass());
+  }
 }

 // Pass Pipeline Configuration
Index: lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
===================================================================
--- lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -81,6 +81,66 @@
 //
 //===----------------------------------------------------------------------===//

+//===----------------------------------------------------------------------===//
+//
+// Even though the algorithm above can extract constants from GEPs, it is
+// sometimes still unable to fully CSE complex GEPs. There are two reasons:
+// (1) The above algorithm does not extract constants from structure type
+//     indices, even though such indices are always constants and could always
+//     be extracted.
+// (2) GEPs with variable indices often cannot be matched by the addressing
+//     mode matcher. Such GEPs are kept as-is for CodeGen, which can only see
+//     GEPs in the same basic block. If two GEPs are in different basic blocks,
+//     CodeGen will not be able to tell whether they share common
+//     subexpressions.
+//
+// For the above reasons, if SimplifyGEP is true, this pass transforms complex
+// GEPs either into several simpler GEPs or into a
+// "ptrtoint + arithmetic + inttoptr" form (depending on whether the target
+// uses alias analysis in codegen).
+// E.g. the following two GEPs are used in two basic blocks:
+//  BB1:
+//    %p = getelementptr inbounds [240 x %struct]* %in, i64 0, i64 %idx, i32 3
+//    load %p
+//    ...
+//  BB2:
+//    %p2 = getelementptr inbounds [240 x %struct]* %in, i64 0, i64 %idx, i32 2
+//    load %p2
+//    ...
+//
+// This pass transforms them into the arithmetic form below (the simpler GEP
+// form is similar):
+//  BB1:
+//    %1 = ptrtoint [240 x %struct]* %in to i64
+//    %2 = mul i64 %idx, length_Of_struct
+//    %3 = add i64 %1, %2
+//    %4 = add i64 %3, 12
+//    %p = inttoptr i64 %4 to i32*
+//    load %p
+//    ...
+//  BB2:
+//    %5 = ptrtoint [240 x %struct]* %in to i64
+//    %6 = mul i64 %idx, length_Of_struct
+//    %7 = add i64 %5, %6
+//    %8 = add i64 %7, 8
+//    %p2 = inttoptr i64 %8 to i32*
+//    load %p2
+//    ...
+//
+// This way the pass can extract the constant in the last structure type index,
+// and it becomes easy to remove the following common subexpressions:
+//    %5 = ptrtoint [240 x %struct]* %in to i64
+//    %6 = mul i64 %idx, length_Of_struct
+//    %7 = add i64 %5, %6
+//
+// This can also improve the address sinking logic in CodeGenPrepare.
+// CodeGenPrepare tries to sink address calculations that match the target's
+// addressing modes. Complex GEPs may not match and will not be sunk in that
+// case. With this transformation, part of each original GEP can be matched
+// and sunk, so we end up with better address calculations.
+//
+//===----------------------------------------------------------------------===//
+
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
@@ -92,6 +152,9 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"

 using namespace llvm;

@@ -241,7 +304,9 @@
 class SeparateConstOffsetFromGEP : public FunctionPass {
 public:
   static char ID;
-  SeparateConstOffsetFromGEP() : FunctionPass(ID) {
+  SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
+                             bool SimplifyGEP = false)
+      : FunctionPass(ID), TM(TM), SimplifyGEP(SimplifyGEP) {
     initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
   }

@@ -264,11 +329,23 @@
   /// Tries to split the given GEP into a variadic base and a constant offset,
   /// and returns true if the splitting succeeds.
   bool splitGEP(GetElementPtrInst *GEP);
-  /// Finds the constant offset within each index, and accumulates them. This
-  /// function only inspects the GEP without changing it. The output
-  /// NeedsExtraction indicates whether we can extract a non-zero constant
-  /// offset from any index.
-  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+  /// Tries to extract a constant offset from the given GEP and simplify it
+  /// into either several simpler GEPs or an arithmetic form (depending on
+  /// whether the target uses alias analysis in codegen).
+  bool extractConstantAndSimplifyGEP(GetElementPtrInst *GEP);
+  /// Finds the constant offset within each index for sequential types, and
+  /// accumulates them. This function only inspects the GEP without changing
+  /// it.
+  /// The output NeedsExtraction indicates whether we can extract a constant
+  /// offset.
+  int64_t accumulateByteOffsetForSeq(GetElementPtrInst *GEP,
+                                     bool &NeedsExtraction);
+  /// Finds the constant offset within each index for sequential and struct
+  /// types, and accumulates them. This function only inspects the GEP without
+  /// changing it. The output NeedsExtraction indicates whether we can extract
+  /// a constant offset.
+  int64_t accumulateByteOffsetForSeqAndStruct(GetElementPtrInst *GEP,
+                                              bool &NeedsExtraction);
   /// Canonicalize array indices to pointer-size integers. This helps to
   /// simplify the logic of splitting a GEP. For example, if a + b is a
   /// pointer-size integer, we have
@@ -287,6 +364,8 @@
   bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);

   const DataLayout *DL;
+  const TargetMachine *TM;
+  bool SimplifyGEP;
 };
 }  // anonymous namespace

@@ -302,8 +381,10 @@
     "Split GEPs to a variadic base and a constant offset for better CSE",
     false, false)

-FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() {
-  return new SeparateConstOffsetFromGEP();
+FunctionPass *
+llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
+                                           bool SimplifyGEP) {
+  return new SeparateConstOffsetFromGEP(TM, SimplifyGEP);
 }

 bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
@@ -650,8 +731,8 @@
 }

 int64_t
-SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
-                                                 bool &NeedsExtraction) {
+SeparateConstOffsetFromGEP::accumulateByteOffsetForSeq(GetElementPtrInst *GEP,
+                                                       bool &NeedsExtraction) {
   NeedsExtraction = false;
   int64_t AccumulativeByteOffset = 0;
   gep_type_iterator GTI = gep_type_begin(*GEP);
@@ -690,7 +771,8 @@
   bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);

   bool NeedsExtraction;
-  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+  int64_t AccumulativeByteOffset =
+      accumulateByteOffsetForSeq(GEP, NeedsExtraction);

   if (!NeedsExtraction)
     return Changed;
@@ -815,11 +897,185 @@
   return true;
 }

+int64_t SeparateConstOffsetFromGEP::accumulateByteOffsetForSeqAndStruct(
+    GetElementPtrInst *GEP, bool &NeedsExtraction) {
+  NeedsExtraction = false;
+  int64_t AccumulativeByteOffset = 0;
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      // Tries to extract a constant offset from this GEP index.
+      bool FoundConst = false;
+      int64_t ConstantOffset = ConstantOffsetExtractor::Find(
+          GEP->getOperand(I), DL, GEP, FoundConst);
+      if (FoundConst) {
+        NeedsExtraction = true;
+        // A GEP may have multiple indices. We accumulate the extracted
+        // constant offset to a byte offset, and later offset the remainder of
+        // the original GEP with this byte offset.
+        AccumulativeByteOffset +=
+            ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
+      }
+    } else {
+      StructType *StTy = cast<StructType>(*GTI);
+      unsigned Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
+      // Skip field 0 as the offset is always 0.
+      if (Field != 0) {
+        NeedsExtraction = true;
+        AccumulativeByteOffset +=
+            DL->getStructLayout(StTy)->getElementOffset(Field);
+      }
+    }
+  }
+  return AccumulativeByteOffset;
+}
+
+bool SeparateConstOffsetFromGEP::extractConstantAndSimplifyGEP(
+    GetElementPtrInst *GEP) {
+  // Skip vector GEPs.
+  if (GEP->getType()->isVectorTy())
+    return false;
+
+  // The backend can already nicely handle the case where all indices are
+  // constants.
+  if (GEP->hasAllConstantIndices())
+    return false;
+
+  bool NeedsExtraction;
+  int64_t AccumulativeByteOffset =
+      accumulateByteOffsetForSeqAndStruct(GEP, NeedsExtraction);
+
+  if (!NeedsExtraction)
+    return false;
+
+  canonicalizeArrayIndicesToPointerSize(GEP);
+  // Remove the constant offset in each GEP index. The resultant GEP computes
+  // the variadic base.
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+    // Skip indices for struct types as they are always constants and have
+    // already been accumulated.
+    if (isa<SequentialType>(*GTI)) {
+      // Tries to extract constant offsets and build a new index.
+      Value *NewIdx =
+          ConstantOffsetExtractor::Extract(GEP->getOperand(I), DL, GEP);
+      if (NewIdx != nullptr)
+        GEP->setOperand(I, NewIdx);
+    }
+  }
+
+  // The inbounds keyword describes pointer arithmetic overflow conditions. As
+  // indices are always treated as signed values, set HasNSW to InBounds when
+  // creating MUL/SHL/ADD.
+  bool InBounds = GEP->isInBounds();
+  IRBuilder<> Builder(GEP);
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+  // As BasicAA currently does not analyze ptrtoint/inttoptr, do not transform
+  // to such a form if the target uses alias analysis in codegen.
+  bool TransformToSimplerGEPs =
+      TM && TM->getSubtarget<TargetSubtargetInfo>().useAA();
+  if (TransformToSimplerGEPs) {
+    SmallVector<Value *, 8> ResultIndices;
+    GTI = gep_type_begin(*GEP);
+    // Create a new index for each variable index.
+    for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+      if (!isa<ConstantInt>(GEP->getOperand(I))) {
+        Value *Idx = GEP->getOperand(I);
+        APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+                                  DL->getTypeAllocSize(GTI.getIndexedType()));
+        if (ElementSize != 1) {
+          if (ElementSize.isPowerOf2())
+            Idx = Builder.CreateShl(
+                Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()), "",
+                false, InBounds);
+          else
+            Idx =
+                Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize),
+                                  "", false, InBounds);
+        }
+        ResultIndices.push_back(Idx);
+      }
+    }
+
+    Type *I8PtrTy =
+        Builder.getInt8PtrTy(GEP->getType()->getPointerAddressSpace());
+    Value *ResultPtr = GEP->getOperand(0);
+    if (ResultPtr->getType() != I8PtrTy)
+      ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+
+    // Create a simple GEP for each variable index.
+    for (auto I = ResultIndices.begin(), E = ResultIndices.end(); I != E; ++I) {
+      if (InBounds)
+        ResultPtr = Builder.CreateInBoundsGEP(ResultPtr, *I, "simplegep");
+      else
+        ResultPtr = Builder.CreateGEP(ResultPtr, *I, "simplegep");
+    }
+    // Create a simple GEP for the constant offset index.
+    if (AccumulativeByteOffset != 0) {
+      Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
+      if (InBounds)
+        ResultPtr = Builder.CreateInBoundsGEP(ResultPtr, Offset, "simplegep");
+      else
+        ResultPtr = Builder.CreateGEP(ResultPtr, Offset, "simplegep");
+    }
+
+    if (ResultPtr->getType() != GEP->getType())
+      ResultPtr = Builder.CreateBitCast(ResultPtr, GEP->getType());
+    GEP->replaceAllUsesWith(ResultPtr);
+    GEP->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, transform the complex GEP into a ptrtoint/inttoptr form.
+  Value *ResultPtr = Builder.CreatePtrToInt(GEP->getOperand(0), IntPtrTy);
+  GTI = gep_type_begin(*GEP);
+  // Create ADD/SHL/MUL arithmetic operations for each variable index.
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+    if (!isa<ConstantInt>(GEP->getOperand(I))) {
+      Value *Idx = GEP->getOperand(I);
+      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+                                DL->getTypeAllocSize(GTI.getIndexedType()));
+      if (ElementSize != 1) {
+        if (ElementSize.isPowerOf2())
+          Idx = Builder.CreateShl(
+              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()), "",
+              false, InBounds);
+        else
+          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize),
+                                  "", false, InBounds);
+      }
+      ResultPtr = Builder.CreateAdd(ResultPtr, Idx, "", false, InBounds);
+    }
+  }
+
+  // Create an ADD for the constant offset.
+  if (AccumulativeByteOffset != 0)
+    ResultPtr = Builder.CreateAdd(
+        ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset), "",
+        false, InBounds);
+
+  ResultPtr = Builder.CreateIntToPtr(ResultPtr, GEP->getType());
+  GEP->replaceAllUsesWith(ResultPtr);
+  GEP->eraseFromParent();
+  return true;
+}
+
 bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
   if (DisableSeparateConstOffsetFromGEP)
     return false;

   bool Changed = false;
+  // If SimplifyGEP is set, simplify each complex GEP into either several
+  // simpler GEPs or a ptrtoint + arithmetic + inttoptr form.
+  if (SimplifyGEP) {
+    for (auto &B : F) {
+      for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
+        if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
+          Changed |= extractConstantAndSimplifyGEP(GEP);
+    }
+    return Changed;
+  }
+
   for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
     for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) {
       if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) {
Index: test/CodeGen/AArch64/aarch64-gep-opt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-gep-opt.ll
@@ -0,0 +1,150 @@
+; RUN: llc -O3 -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -O3 -print-after=codegenprepare -mcpu=cyclone < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-NoAA <%t %s
+; RUN: llc -O3 -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-UseAA <%t %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnueabi"
+
+; The following tests check that the SeparateConstOffsetFromGEP pass is enabled
+; in the AArch64 backend. If useAA() returns true, a complex GEP is transformed
+; into simpler GEPs; otherwise it is transformed into a ptrtoint/inttoptr form.
+
+%struct = type { i32, i32, i32, i32, [20 x i32] }
+
+; Check that when two complex GEPs are used in two basic blocks, LLVM can
+; eliminate the common subexpression for the second use.
+define void @test_GEP_CSE([240 x %struct]* %string, i32* %adj, i32 %lib, i64 %idxprom) {
+  %liberties = getelementptr inbounds [240 x %struct]* %string, i64 1, i64 %idxprom, i32 3
+  %1 = load i32* %liberties, align 4
+  %cmp = icmp eq i32 %1, %lib
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %origin = getelementptr inbounds [240 x %struct]* %string, i64 1, i64 %idxprom, i32 2
+  %2 = load i32* %origin, align 4
+  store i32 %2, i32* %adj, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; CHECK-LABEL: test_GEP_CSE:
+; CHECK: madd
+; CHECK: ldr
+; CHECK-NOT: madd
+; CHECK: ldr
+
+; CHECK-NoAA-LABEL: @test_GEP_CSE(
+; CHECK-NoAA: [[PTR0:%[a-zA-Z0-9]+]] = ptrtoint [240 x %struct]* %string to i64
+; CHECK-NoAA: [[PTR1:%[a-zA-Z0-9]+]] = mul nsw i64 %idxprom, 96
+; CHECK-NoAA: [[PTR2:%[a-zA-Z0-9]+]] = add nsw i64 [[PTR0]], [[PTR1]]
+; CHECK-NoAA: add nsw i64 [[PTR2]], 23052
+; CHECK-NoAA: inttoptr
+; CHECK-NoAA: if.then:
+; CHECK-NoAA-NOT: ptrtoint
+; CHECK-NoAA-NOT: mul
+; CHECK-NoAA: add nsw i64 [[PTR2]], 23048
+; CHECK-NoAA: inttoptr
+
+; CHECK-UseAA-LABEL: @test_GEP_CSE(
+; CHECK-UseAA: [[IDX:%[a-zA-Z0-9]+]] = mul nsw i64 %idxprom, 96
+; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = bitcast [240 x %struct]* %string to i8*
+; CHECK-UseAA: [[PTR1:%[a-zA-Z0-9]+]] = getelementptr inbounds i8* [[PTR0]], i64 [[IDX]]
+; CHECK-UseAA: getelementptr inbounds i8* [[PTR1]], i64 23052
+; CHECK-UseAA: bitcast
+; CHECK-UseAA: if.then:
+; CHECK-UseAA: getelementptr inbounds i8* [[PTR1]], i64 23048
+; CHECK-UseAA: bitcast
+
+%class.my = type { i32, [128 x i32], i32, [256 x %struct.pt] }
+%struct.pt = type { %struct.point*, i32, i32 }
+%struct.point = type { i32, i32 }
+
+; Check that when a GEP is used across two basic blocks, LLVM can sink the
+; address calculation and codegen can generate a better addressing mode for the
+; second use.
+define void @test_GEP_across_BB(%class.my* %this, i64 %idx) {
+  %1 = getelementptr inbounds %class.my* %this, i64 0, i32 3, i64 %idx, i32 1
+  %2 = load i32* %1, align 4
+  %3 = getelementptr inbounds %class.my* %this, i64 0, i32 3, i64 %idx, i32 2
+  %4 = load i32* %3, align 4
+  %5 = icmp eq i32 %2, %4
+  br i1 %5, label %if.true, label %exit
+
+if.true:
+  %6 = shl i32 %4, 1
+  store i32 %6, i32* %3, align 4
+  br label %exit
+
+exit:
+  %7 = add nsw i32 %4, 1
+  store i32 %7, i32* %1, align 4
+  ret void
+}
+; CHECK-LABEL: test_GEP_across_BB:
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #528]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #532]
+; CHECK-NOT: add
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #532]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #528]
+
+; CHECK-NoAA-LABEL: test_GEP_across_BB(
+; CHECK-NoAA: add nsw i64 [[TMP:%[a-zA-Z0-9]+]], 528
+; CHECK-NoAA: add nsw i64 [[TMP]], 532
+; CHECK-NoAA: if.true:
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 532
+; CHECK-NoAA: exit:
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 528
+
+; CHECK-UseAA-LABEL: test_GEP_across_BB(
+; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = getelementptr
+; CHECK-UseAA: getelementptr inbounds i8* [[PTR0]], i64 528
+; CHECK-UseAA: getelementptr inbounds i8* [[PTR0]], i64 532
+; CHECK-UseAA: if.true:
+; CHECK-UseAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8* [[PTR0]], i64 532
+; CHECK-UseAA: exit:
+; CHECK-UseAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8* [[PTR0]], i64 528
+
+%struct.S = type { float, double }
+@struct_array = global [1024 x %struct.S] zeroinitializer, align 16
+
+; The following two test cases check that we can extract constants from struct
+; type indices.
+; The constant offsets come from the indices "i64 %idxprom" and "i32 1". As the
+; alloc size of %struct.S is 16, and "i32 1" is the 2nd element whose field
+; offset is 8, the total constant offset is (5 * 16 + 8) = 88.
+define double* @test-struct_1(i32 %i) {
+entry:
+  %add = add nsw i32 %i, 5
+  %idxprom = sext i32 %add to i64
+  %p = getelementptr inbounds [1024 x %struct.S]* @struct_array, i64 0, i64 %idxprom, i32 1
+  ret double* %p
+}
+; CHECK-NoAA-LABEL: @test-struct_1(
+; CHECK-NoAA-NOT: getelementptr
+; CHECK-NoAA: add nsw i64 %{{[a-zA-Z0-9]+}}, 88
+
+; CHECK-UseAA-LABEL: @test-struct_1(
+; CHECK-UseAA: getelementptr inbounds i8* %{{[a-zA-Z0-9]+}}, i64 88
+
+%struct3 = type { i64, i32 }
+%struct2 = type { %struct3, i32 }
+%struct1 = type { i64, %struct2 }
+%struct0 = type { i32, i32, i64*, [100 x %struct1] }
+
+; The constant offsets come from the indices "i32 3", "i64 %arrayidx" and
+; "i32 1". "i32 3" is the 4th element whose field offset is 16. The alloc size
+; of %struct1 is 32. "i32 1" is the 2nd element whose field offset is 8.
+; So the total constant offset is 16 + (-2 * 32) + 8 = -40.
+define %struct2* @test-struct_2(%struct0* %ptr, i64 %idx) {
+entry:
+  %arrayidx = add nsw i64 %idx, -2
+  %ptr2 = getelementptr inbounds %struct0* %ptr, i64 0, i32 3, i64 %arrayidx, i32 1
+  ret %struct2* %ptr2
+}
+; CHECK-NoAA-LABEL: @test-struct_2(
+; CHECK-NoAA-NOT: = getelementptr
+; CHECK-NoAA: add nsw i64 %{{[a-zA-Z0-9]+}}, -40
+
+; CHECK-UseAA-LABEL: @test-struct_2(
+; CHECK-UseAA: getelementptr inbounds i8* %{{[a-zA-Z0-9]+}}, i64 -40
Index: test/CodeGen/AArch64/arm64-addr-mode-folding.ll
===================================================================
--- test/CodeGen/AArch64/arm64-addr-mode-folding.ll
+++ test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple arm64-apple-ios3 -aarch64-gep-opt=false %s -o - | FileCheck %s
 ;
 @block = common global i8* null, align 8

Index: test/CodeGen/AArch64/arm64-cse.ll
===================================================================
--- test/CodeGen/AArch64/arm64-cse.ll
+++ test/CodeGen/AArch64/arm64-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 -aarch64-gep-opt=false | FileCheck %s
 target triple = "arm64-apple-ios"
 ; rdar://12462006
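
Note: the pass scheduling above is AArch64-specific, but any target can opt in
to the same GEP optimization from its own TargetPassConfig. A minimal sketch is
shown below; the MyTargetPassConfig class is hypothetical and simply mirrors
the AArch64 hunk in this patch (same passes, same order, same -O3 guard).

  #include "llvm/CodeGen/Passes.h"
  #include "llvm/Transforms/Scalar.h"

  void MyTargetPassConfig::addIRPasses() {
    TargetPassConfig::addIRPasses();
    if (TM->getOptLevel() == CodeGenOpt::Aggressive) {
      // Split complex GEPs and extract constant offsets. Passing TM lets the
      // pass query useAA() and choose between the simpler-GEP form and the
      // ptrtoint/inttoptr form.
      addPass(createSeparateConstOffsetFromGEPPass(TM, true));
      // Remove the common subexpressions exposed by the lowered GEPs.
      addPass(createEarlyCSEPass());
      // Hoist address computations that turn out to be loop invariant.
      addPass(createLICMPass());
    }
  }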