diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -766,7 +766,8 @@
 // pattern which suggests that the loads can be combined. The one and only use
 // of the loads is to form a wider load.
 static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
-                                 TargetTransformInfo &TTI, AliasAnalysis &AA) {
+                                 TargetTransformInfo &TTI, AliasAnalysis &AA,
+                                 const DominatorTree &DT) {
   // Only consider load chains of scalar values.
   if (isa<VectorType>(I.getType()))
     return false;
@@ -791,15 +792,17 @@
   if (!Allowed || !Fast)
     return false;
 
-  // Make sure the Load pointer of type GEP/non-GEP is above insert point
-  Instruction *Inst = dyn_cast<Instruction>(LI1->getPointerOperand());
-  if (Inst && Inst->getParent() == LI1->getParent() &&
-      !Inst->comesBefore(LOps.RootInsert))
-    Inst->moveBefore(LOps.RootInsert);
-
-  // New load can be generated
+  // Get the Index and Ptr for the new GEP.
   Value *Load1Ptr = LI1->getPointerOperand();
   Builder.SetInsertPoint(LOps.RootInsert);
+  if (!DT.dominates(Load1Ptr, LOps.RootInsert)) {
+    APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
+    Load1Ptr = Load1Ptr->stripAndAccumulateConstantOffsets(
+        DL, Offset1, /* AllowNonInbounds */ true);
+    Load1Ptr = Builder.CreateGEP(Builder.getInt8Ty(), Load1Ptr,
+                                 Builder.getInt32(Offset1.getZExtValue()));
+  }
+  // Generate wider load.
   Value *NewPtr = Builder.CreateBitCast(Load1Ptr, WiderType->getPointerTo(AS));
   NewLoad = Builder.CreateAlignedLoad(WiderType, NewPtr, LI1->getAlign(),
                                       LI1->isVolatile(), "");
@@ -936,7 +939,7 @@
   MadeChange |= tryToRecognizePopCount(I);
   MadeChange |= tryToFPToSat(I, TTI);
   MadeChange |= tryToRecognizeTableBasedCttz(I);
-  MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA);
+  MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
   MadeChange |= foldPatternedLoads(I, DL);
   // NOTE: This function introduces erasing of the instruction `I`, so it
   // needs to be called at the end of this sequence, otherwise we may make
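The hunk above replaces the old "hoist the defining instruction" workaround with a dominance check plus pointer rebasing. A minimal standalone sketch of that rebasing step, assuming a pointer whose GEP/bitcast chain carries only constant offsets; the helper and its name are illustrative, not part of the patch:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch: if Ptr is not already usable at InsertPt, peel its constant GEP
// offsets back to the underlying base pointer and re-materialize a single
// byte-wise GEP at the insert point. Mirrors the hunk above.
static Value *rebaseToDominatingPtr(Value *Ptr, Instruction *InsertPt,
                                    const DataLayout &DL,
                                    const DominatorTree &DT) {
  if (DT.dominates(Ptr, InsertPt))
    return Ptr; // Already defined above the insert point; nothing to do.

  // Walk the GEP/bitcast chain down to its base, accumulating the total
  // constant byte offset into Offset.
  APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
  Value *Base = Ptr->stripAndAccumulateConstantOffsets(
      DL, Offset, /*AllowNonInbounds=*/true);

  // Re-create the addressing as one i8 GEP at the insert point.
  IRBuilder<> Builder(InsertPt);
  return Builder.CreateGEP(Builder.getInt8Ty(), Base,
                           Builder.getInt32(Offset.getZExtValue()));
}

As in the patch, this relies on the stripped base (a function argument, or an instruction above the insert point) dominating InsertPt; stripAndAccumulateConstantOffsets looks through bitcasts and constant GEPs, which is exactly what the nested_gep and bitcast_gep tests below exercise.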
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll b/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll
@@ -1869,8 +1869,8 @@
 
 define i32 @loadCombine_4consecutive_badinsert3(ptr %p) {
 ; LE-LABEL: @loadCombine_4consecutive_badinsert3(
-; LE-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
-; LE-NEXT:    [[L1:%.*]] = load i32, ptr [[P1]], align 1
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
+; LE-NEXT:    [[L1:%.*]] = load i32, ptr [[TMP1]], align 1
 ; LE-NEXT:    ret i32 [[L1]]
 ;
 ; BE-LABEL: @loadCombine_4consecutive_badinsert3(
@@ -2085,3 +2085,82 @@
   %o3 = or i32 %o2, %e1
   ret i32 %o3
 }
+
+define void @nested_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @nested_gep(
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT:    [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @nested_gep(
+; BE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT:    [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT:    [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT:    [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 64
+; BE-NEXT:    [[FINAL_PTR:%.*]] = getelementptr inbounds i8, ptr [[GEP2]], i64 4
+; BE-NEXT:    [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT:    [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT:    [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT:    [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+  %ld1 = load i32, ptr %gep1, align 4
+  %ld1_zext = zext i32 %ld1 to i64
+  %ld1_shl = shl nuw i64 %ld1_zext, 32
+  %gep2 = getelementptr inbounds i8, ptr %p, i64 64
+  ; Don't move final_ptr before gep2
+  %final_ptr = getelementptr inbounds i8, ptr %gep2, i64 4
+  %ld2 = load i32, ptr %final_ptr, align 4
+  %ld2_zext = zext i32 %ld2 to i64
+  %or = or i64 %ld1_shl, %ld2_zext
+  %add = add i64 %or, 0
+  %trunc = trunc i64 %add to i32
+  store i32 %trunc, ptr %dest, align 4
+  ret void
+}
+
+
+define void @bitcast_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @bitcast_gep(
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT:    [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @bitcast_gep(
+; BE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT:    [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT:    [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT:    [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 68
+; BE-NEXT:    [[FINAL_PTR:%.*]] = bitcast ptr [[GEP2]] to ptr
+; BE-NEXT:    [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT:    [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT:    [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT:    [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+  %ld1 = load i32, ptr %gep1, align 4
+  %ld1_zext = zext i32 %ld1 to i64
+  %ld1_shl = shl nuw i64 %ld1_zext, 32
+  %gep2 = getelementptr inbounds i8, ptr %p, i64 68
+  ; Don't move final_ptr before gep2
+  %final_ptr = bitcast ptr %gep2 to ptr
+  %ld2 = load i32, ptr %final_ptr, align 4
+  %ld2_zext = zext i32 %ld2 to i64
+  %or = or i64 %ld1_shl, %ld2_zext
+  %add = add i64 %or, 0
+  %trunc = trunc i64 %add to i32
+  store i32 %trunc, ptr %dest, align 4
+  ret void
+}
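In the LE runs of the two tests above, the i32 loads at offsets 68 and 72 fold into a single i64 load at offset 68. A quick host-side check of that byte-order argument; a sketch assuming a little-endian host, with arbitrary buffer contents standing in for the memory behind %p:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  unsigned char buf[80];
  for (int i = 0; i < 80; ++i)
    buf[i] = static_cast<unsigned char>(i * 7 + 3);

  uint32_t lo, hi;
  std::memcpy(&lo, buf + 68, 4); // %ld2: the i32 load at offset 68
  std::memcpy(&hi, buf + 72, 4); // %ld1: the i32 load at offset 72
  uint64_t combined = (static_cast<uint64_t>(hi) << 32) | lo;

  uint64_t wide;
  std::memcpy(&wide, buf + 68, 8); // the folded i64 load at offset 68

  // On a little-endian host the wide load equals the shl/or combination,
  // which is why the LE prefixes fold while the BE prefixes do not.
  assert(wide == combined);
  return 0;
}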
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll
--- a/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll
@@ -2005,8 +2005,8 @@
 
 define i32 @loadCombine_4consecutive_badinsert3(ptr %p) {
 ; LE-LABEL: @loadCombine_4consecutive_badinsert3(
-; LE-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
-; LE-NEXT:    [[L1:%.*]] = load i32, ptr [[P1]], align 1
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
+; LE-NEXT:    [[L1:%.*]] = load i32, ptr [[TMP1]], align 1
 ; LE-NEXT:    ret i32 [[L1]]
 ;
 ; BE-LABEL: @loadCombine_4consecutive_badinsert3(
@@ -2303,3 +2303,82 @@
   %o7 = or i64 %s1, %s0
   ret i64 %o7
 }
+
+define void @nested_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @nested_gep(
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT:    [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @nested_gep(
+; BE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT:    [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT:    [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT:    [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 64
+; BE-NEXT:    [[FINAL_PTR:%.*]] = getelementptr inbounds i8, ptr [[GEP2]], i64 4
+; BE-NEXT:    [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT:    [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT:    [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT:    [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+  %ld1 = load i32, ptr %gep1, align 4
+  %ld1_zext = zext i32 %ld1 to i64
+  %ld1_shl = shl nuw i64 %ld1_zext, 32
+  %gep2 = getelementptr inbounds i8, ptr %p, i64 64
+  ; Don't move final_ptr before gep2
+  %final_ptr = getelementptr inbounds i8, ptr %gep2, i64 4
+  %ld2 = load i32, ptr %final_ptr, align 4
+  %ld2_zext = zext i32 %ld2 to i64
+  %or = or i64 %ld1_shl, %ld2_zext
+  %add = add i64 %or, 0
+  %trunc = trunc i64 %add to i32
+  store i32 %trunc, ptr %dest, align 4
+  ret void
+}
+
+
+define void @bitcast_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @bitcast_gep(
+; LE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT:    [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @bitcast_gep(
+; BE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT:    [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT:    [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT:    [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 68
+; BE-NEXT:    [[FINAL_PTR:%.*]] = bitcast ptr [[GEP2]] to ptr
+; BE-NEXT:    [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT:    [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT:    [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT:    [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT:    [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT:    store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+  %ld1 = load i32, ptr %gep1, align 4
+  %ld1_zext = zext i32 %ld1 to i64
+  %ld1_shl = shl nuw i64 %ld1_zext, 32
+  %gep2 = getelementptr inbounds i8, ptr %p, i64 68
+  ; Don't move final_ptr before gep2
+  %final_ptr = bitcast ptr %gep2 to ptr
+  %ld2 = load i32, ptr %final_ptr, align 4
+  %ld2_zext = zext i32 %ld2 to i64
+  %or = or i64 %ld1_shl, %ld2_zext
+  %add = add i64 %or, 0
+  %trunc = trunc i64 %add to i32
+  store i32 %trunc, ptr %dest, align 4
+  ret void
+}
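For completeness, the DominatorTree threaded into foldConsecutiveLoads comes from the pass manager in the caller. A sketch of that wiring with the standard new-pass-manager APIs; the pass name and driver loop here are assumptions for illustration, not the actual AggressiveInstCombine driver:

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

// Stands in for the static helper from the patch; repeated here only so the
// sketch reads as a unit.
bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
                          TargetTransformInfo &TTI, AliasAnalysis &AA,
                          const DominatorTree &DT);

namespace {
struct ExamplePass : PassInfoMixin<ExamplePass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    // Fetch the analyses the fold needs, including the DominatorTree used
    // for the new dominance check.
    auto &TTI = AM.getResult<TargetIRAnalysis>(F);
    auto &AA = AM.getResult<AAManager>(F);
    auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
    const DataLayout &DL = F.getParent()->getDataLayout();

    bool MadeChange = false;
    for (BasicBlock &BB : F)
      for (Instruction &I : make_early_inc_range(BB))
        MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
    return MadeChange ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};
} // namespace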