Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17960,6 +17960,12 @@
   LoadSDNode *LLD = cast<LoadSDNode>(LHS);
   LoadSDNode *RLD = cast<LoadSDNode>(RHS);
 
+  unsigned RequiredAlignment;
+  EVT LDVT = LLD->getMemoryVT();
+  if (DAG.areNonVolatileConsecutiveLoads(LLD, RLD, LDVT.getStoreSize(), 1) &&
+      TLI.hasPairedLoad(LDVT, RequiredAlignment))
+    return false;
+
   // Token chains must be identical.
   if (LHS.getOperand(0) != RHS.getOperand(0) ||
       // Do not let this transformation reduce the number of volatile loads.
Index: lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1077,6 +1077,19 @@
   if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
     return nullptr;
 
+  // zext(cmp) ~> select cmp, 1, 0
+  // Canonicalize into a select to give a gep the opportunity to fold into
+  // the select. This will eventually turn back into zext(cmp) if the gep
+  // does not fold.
+  if (CI.hasOneUse() && isa<GetElementPtrInst>(CI.user_back()) &&
+      cast<GEPOperator>(CI.user_back())->countNonConstantIndices() == 1) {
+    if (auto *Cmp = dyn_cast<CmpInst>(CI.getOperand(0))) {
+      auto *TV = ConstantInt::get(cast<IntegerType>(CI.getType()), 1);
+      auto *FV = ConstantInt::get(cast<IntegerType>(CI.getType()), 0);
+      return SelectInst::Create(Cmp, TV, FV);
+    }
+  }
+
   // If one of the common conversion will work, do it.
   if (Instruction *Result = commonCastTransforms(CI))
     return Result;
Index: lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineInternal.h
+++ lib/Transforms/InstCombine/InstCombineInternal.h
@@ -810,6 +810,8 @@
   /// Canonicalize the position of binops relative to shufflevector.
   Instruction *foldVectorBinop(BinaryOperator &Inst);
 
+  Instruction *FoldGEPIntoSelect(GetElementPtrInst &GEP, SelectInst *SI);
+
   /// Given a binary operator, cast instruction, or select which has a PHI node
   /// as operand #0, see if we can fold the instruction into the PHI (which is
   /// only possible if all operands to the PHI are constants).
Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -990,6 +990,53 @@
   return false;
 }
 
+// If we are loading from a select of GEPs, we can safely speculate the load
+// (unless either GEP was going to trap anyway) as long as we can statically
+// infer that the memory locations addressed by both GEPs are in bounds.
+static bool isSafeToSpeculateLoad(Value *TValue, Value *FValue, LoadInst &LI,
+                                  const DataLayout &DL) {
+  assert(LI.isUnordered() && "Illegal speculation of ordered load");
+
+  if (isa<GetElementPtrInst>(TValue) && isa<GetElementPtrInst>(FValue)) {
+    auto &TGEP = *cast<GetElementPtrInst>(TValue);
+    auto &FGEP = *cast<GetElementPtrInst>(FValue);
+
+    // Returns the index of the single operand at which the two Users
+    // differ, provided all of their other operands are identical.
+    // Returns -1 otherwise.
+    auto HaveIdenticalOperandsExcept = [] (User &U1, User &U2) {
+      if (U1.getNumOperands() != U2.getNumOperands())
+        return -1;
+      for (unsigned Idx = 0; Idx < U1.getNumOperands(); ++Idx)
+        if (U1.getOperand(Idx) != U2.getOperand(Idx))
+          if (std::equal(U1.op_begin()+Idx+1, U1.op_end(), U2.op_begin()+Idx+1))
+            return (int)Idx;
+      return -1;
+    };
+
+    int Idx = HaveIdenticalOperandsExcept(TGEP, FGEP);
+    // Disregard the pointer operand.
+    if (Idx > 0) {
+      auto *C1 = dyn_cast<ConstantInt>(*(TGEP.op_begin()+Idx));
+      auto *C2 = dyn_cast<ConstantInt>(*(FGEP.op_begin()+Idx));
+      if (C1 && C2) {
+        SmallVector<Value *, 8> Indices(TGEP.op_begin()+1, TGEP.op_begin()+Idx);
+        uint64_t Max = std::max(C1->getLimitedValue(), C2->getLimitedValue());
+        Type *GepRetTy = GetElementPtrInst::getGEPReturnType(
+            TGEP.getPointerOperand(), Indices);
+        Type *ElemTy = GepRetTy->getPointerElementType();
+        if ((ElemTy->isArrayTy() && ElemTy->getArrayNumElements() > Max) ||
+            (ElemTy->isStructTy() && ElemTy->getStructNumElements() > Max))
+          return true;
+      }
+    }
+  }
+
+  // Fall through to the conservative speculation logic.
+  return isSafeToLoadUnconditionally(TValue, LI.getAlignment(), DL, &LI) &&
+         isSafeToLoadUnconditionally(FValue, LI.getAlignment(), DL, &LI);
+}
+
 Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
   Value *Op = LI.getOperand(0);
 
@@ -1065,8 +1112,7 @@
   if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
     // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
     unsigned Align = LI.getAlignment();
-    if (isSafeToLoadUnconditionally(SI->getOperand(1), Align, DL, SI) &&
-        isSafeToLoadUnconditionally(SI->getOperand(2), Align, DL, SI)) {
+    if (isSafeToSpeculateLoad(SI->getOperand(1), SI->getOperand(2), LI, DL)) {
       LoadInst *V1 = Builder.CreateLoad(SI->getOperand(1),
                                         SI->getOperand(1)->getName()+".val");
       LoadInst *V2 = Builder.CreateLoad(SI->getOperand(2),
Index: lib/Transforms/InstCombine/InstCombineSelect.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1568,6 +1568,13 @@
     }
   }
 
+  // Let the GEP fold into the Select if possible. We can still optimize
+  // the Select later if the GEP doesn't fold.
+  if (SI.hasOneUse() && isa<GetElementPtrInst>(SI.user_back()) &&
+      cast<GEPOperator>(SI.user_back())->countNonConstantIndices() == 1 &&
+      isa<ConstantInt>(TrueVal) && isa<ConstantInt>(FalseVal))
+    return nullptr;
+
   if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal,
                                     SQ.getWithInstruction(&SI)))
     return replaceInstUsesWith(SI, V);
Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp
+++ lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -781,6 +781,31 @@
   return nullptr;
 }
 
+Instruction *InstCombiner::FoldGEPIntoSelect(GetElementPtrInst &GEP,
+                                             SelectInst *SI) {
+  assert(GEP.hasOneUse() && isa<LoadInst>(GEP.user_back()) &&
+         "GEP user is not a Load");
+  assert(cast<GEPOperator>(&GEP)->countNonConstantIndices() == 1 &&
+         "GEP has multiple non-constant indices");
+
+  if (!SI->hasOneUse())
+    return nullptr;
+
+  Value *TV = SI->getTrueValue();
+  Value *FV = SI->getFalseValue();
+
+  if (!isa<ConstantInt>(TV) || !isa<ConstantInt>(FV))
+    return nullptr;
+
+  Instruction *NewTV = GEP.clone();
+  Instruction *NewFV = GEP.clone();
+  NewTV->replaceUsesOfWith(SI, TV);
+  NewFV->replaceUsesOfWith(SI, FV);
+  InsertNewInstBefore(NewTV, GEP);
+  InsertNewInstBefore(NewFV, GEP);
+  return SelectInst::Create(SI->getCondition(), NewTV, NewFV);
+}
+
 static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
                                              InstCombiner::BuilderTy &Builder) {
   if (auto *Cast = dyn_cast<CastInst>(&I))
@@ -1688,6 +1713,17 @@
     PtrOp = NewGEP;
   }
 
+  // Before combining indices, try to fold the GEP into a Select as long
+  // as its only user is a load. This can allow further folding of the
+  // load into the Select.
+  if (GEP.hasOneUse() && isa<LoadInst>(GEP.user_back()) &&
+      cast<GEPOperator>(&GEP)->countNonConstantIndices() == 1) {
+    for (auto &Op : GEP.operands())
+      if (auto *SI = dyn_cast<SelectInst>(Op))
+        if (Instruction *NewSel = FoldGEPIntoSelect(GEP, SI))
+          return NewSel;
+  }
+
   // Combine Indices - If the source pointer to this getelementptr instruction
   // is a getelementptr instruction, combine the indices of the two
   // getelementptr instructions into a single instruction.
Index: test/CodeGen/AArch64/select-load.ll
===================================================================
--- test/CodeGen/AArch64/select-load.ll
+++ test/CodeGen/AArch64/select-load.ll
@@ -17,12 +17,10 @@
 ; CHECK-NEXT:    mov w19, w1
 ; CHECK-NEXT:    mov x20, x0
 ; CHECK-NEXT:    bl foo
-; CHECK-NEXT:    add x8, x20, #8 // =8
-; CHECK-NEXT:    add x9, x20, #4 // =4
+; CHECK-NEXT:    ldp w8, w9, [x20, #4]
 ; CHECK-NEXT:    cmp w0, w19
-; CHECK-NEXT:    csel x8, x8, x9, gt
-; CHECK-NEXT:    ldr w0, [x8]
 ; CHECK-NEXT:    ldp x19, x30, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    csel w0, w9, w8, gt
 ; CHECK-NEXT:    ldr x20, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
Index: test/Transforms/InstCombine/gep-select.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/gep-select.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-arm"
+
+%struct.B = type { [4 x %struct.A], i32 }
+%struct.A = type { [2 x i32], i32, i32 }
+
+declare i32 @val()
+
+; It's safe to speculatively execute load(select c, gep1, gep2) if the geps
+; are almost identical, having only one differing index, which is statically
+; known to be in bounds.
+define i32 @test1(%struct.B* %b, i64 %idx1, i64 %idx2, i1 %cmp) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], %struct.B* [[B:%.*]], i64 [[IDX1:%.*]], i32 0, i64 [[IDX2:%.*]], i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [[STRUCT_B]], %struct.B* [[B]], i64 [[IDX1]], i32 0, i64 [[IDX2]], i32 2
+; CHECK-NEXT:    [[GEP1_VAL:%.*]] = load i32, i32* [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2_VAL:%.*]] = load i32, i32* [[GEP2]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = select i1 [[CMP:%.*]], i32 [[GEP1_VAL]], i32 [[GEP2_VAL]]
+; CHECK-NEXT:    ret i32 [[LD]]
+;
+entry:
+  %gep1 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 0, i64 %idx2, i32 1
+  %gep2 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 0, i64 %idx2, i32 2
+  %sel = select i1 %cmp, i32* %gep1, i32* %gep2
+  %ld = load i32, i32* %sel, align 4
+  ret i32 %ld
+}
+
+; Can't trivially speculate load(select c, gep1, gep2) since the geps
+; differ in more than a single operand.
+define i32 @test2(%struct.B* %b, i64 %idx1, i64 %idx2, i1 %cmp) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], %struct.B* [[B:%.*]], i64 [[IDX1:%.*]], i32 0, i64 [[IDX2:%.*]], i32 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [[STRUCT_B]], %struct.B* [[B]], i64 [[IDX1]], i32 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32* [[GEP1]], i32* [[GEP2]]
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[SEL]], align 4
+; CHECK-NEXT:    ret i32 [[LD]]
+;
+entry:
+  %gep1 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 0, i64 %idx2, i32 1
+  %gep2 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 1
+  %sel = select i1 %cmp, i32* %gep1, i32* %gep2
+  %ld = load i32, i32* %sel, align 4
+  ret i32 %ld
+}
+
+; Can't speculate load(select c, gep1, gep2) since we can statically
+; infer that gep1 goes out of bounds.
+define i32 @test3(%struct.B* %b, i64 %idx1, i64 %idx2, i1 %cmp) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], %struct.B* [[B:%.*]], i64 [[IDX1:%.*]], i32 0, i64 [[IDX2:%.*]], i32 0, i64 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [[STRUCT_B]], %struct.B* [[B]], i64 [[IDX1]], i32 0, i64 [[IDX2]], i32 0, i64 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32* [[GEP1]], i32* [[GEP2]]
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[SEL]], align 4
+; CHECK-NEXT:    ret i32 [[LD]]
+;
+entry:
+  %gep1 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 0, i64 %idx2, i32 0, i64 2
+  %gep2 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx1, i32 0, i64 %idx2, i32 0, i64 1
+  %sel = select i1 %cmp, i32* %gep1, i32* %gep2
+  %ld = load i32, i32* %sel, align 4
+  ret i32 %ld
+}
+
+; zext(cmp) ~> select cmp, 1, 0
+; gep(select cmp, 1, 0) ~> select cmp, (gep 1), (gep 0)
+; load(select cmp, (gep 1), (gep 0)) ~> select cmp, load(gep 1), load(gep 0)
+define i32 @test4(%struct.B* %b, i32 %x, i32 %y, i64 %idx) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @val()
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[CALL]], [[X:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[CALL]], [[Y:%.*]]
+; CHECK-NEXT:    [[IDX1:%.*]] = zext i1 [[CMP1]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], %struct.B* [[B:%.*]], i64 [[IDX:%.*]], i32 0, i64 [[IDX1]], i32 0, i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_B]], %struct.B* [[B]], i64 [[IDX]], i32 0, i64 [[IDX1]], i32 0, i64 0
+; CHECK-NEXT:    [[DOTVAL:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[DOTVAL1:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = select i1 [[CMP2]], i32 [[DOTVAL]], i32 [[DOTVAL1]]
+; CHECK-NEXT:    ret i32 [[LD]]
+;
+entry:
+  %call = call i32 @val()
+  %cmp1 = icmp sgt i32 %call, %x
+  %cmp2 = icmp sgt i32 %call, %y
+  %idx1 = zext i1 %cmp1 to i64
+  %idx2 = zext i1 %cmp2 to i64
+  %gep1 = getelementptr inbounds %struct.B, %struct.B* %b, i64 %idx
+  %a = getelementptr inbounds %struct.B, %struct.B* %gep1, i32 0, i32 0
+  %gep2 = getelementptr inbounds [4 x %struct.A], [4 x %struct.A]* %a, i64 0, i64 %idx1
+  %c = getelementptr inbounds %struct.A, %struct.A* %gep2, i32 0, i32 0
+  %gep3 = getelementptr inbounds [2 x i32], [2 x i32]* %c, i64 0, i64 %idx2
+  %ld = load i32, i32* %gep3, align 4
+  ret i32 %ld
+}
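
Note for reviewers (illustration only, not part of the patch): at the source
level, the pattern these folds target is roughly a conditional read of two
members of the same aggregate whose offsets are statically known to be in
bounds. The C++ sketch below is hypothetical; the struct and function names
are made up to mirror %struct.A and %struct.B from gep-select.ll, and whether
the front end and earlier passes (for example SimplifyCFG sinking) actually
present this to InstCombine as a single load of a select of two GEPs depends
on the pipeline. Code of this shape is the kind that can now become the
ldp + csel sequence checked in test/CodeGen/AArch64/select-load.ll.

  // Hypothetical source-level example (not taken from the patch).
  struct A { int Arr[2]; int X; int Y; };  // mirrors %struct.A
  struct B { A As[4]; int N; };            // mirrors %struct.B

  int pick(B *Bp, long Idx1, long Idx2, bool Cond) {
    // Both arms address the same A object and differ only in which member
    // is read, so InstCombine can speculate both loads and select between
    // the loaded values instead of selecting between the two addresses.
    return Cond ? Bp[Idx1].As[Idx2].X : Bp[Idx1].As[Idx2].Y;
  }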