Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp
+++ lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1432,13 +1432,12 @@
     if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
       return nullptr;
 
-    // Note that if our source is a gep chain itself then we wait for that
-    // chain to be resolved before we perform this transformation.  This
-    // avoids us creating a TON of code in some cases.
-    if (GEPOperator *SrcGEP =
-          dyn_cast<GEPOperator>(Src->getOperand(0)))
-      if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
-        return nullptr;   // Wait until our source is folded to completion.
+    // Don't combine the malformed cycle gep instructions like the following:
+    // %gep2 = getelementptr i8, i8* %gep, i32 1
+    // %gep = getelementptr i8, i8* %gep2, i32 1
+    GetElementPtrInst *SrcGEP = dyn_cast<GetElementPtrInst>(Src->getOperand(0));
+    if (SrcGEP == &GEP)
+      return nullptr;
 
     SmallVector<Value*, 8> Indices;
 
@@ -1467,10 +1466,15 @@
         // normalized.
         if (SO1->getType() != GO1->getType())
           return nullptr;
-        // Only do the combine when GO1 and SO1 are both constants. Only in
-        // this case, we are sure the cost after the merge is never more than
-        // that before the merge.
-        if (!isa<Constant>(GO1) || !isa<Constant>(SO1))
+        // Do gep(gep(...)) combine only when
+        // 1. GO1 and SO1 are both constants or
+        // 2. Src has only one use plus that Src and GEP are in the same BB.
+        // In thses cases we are sure the cost of the combined result will be
+        // equal or less than before.
+        if (((!isa<Constant>(GO1) || !isa<Constant>(SO1))) &&
+            (!Src->hasOneUse() ||
+             (isa<Instruction>(Src) &&
+              cast<GetElementPtrInst>(Src)->getParent() != GEP.getParent())))
           return nullptr;
         Sum = Builder->CreateAdd(SO1, GO1, PtrOp->getName()+".sum");
       }
Index: test/Transforms/InstCombine/gep-merge1.ll
===================================================================
--- test/Transforms/InstCombine/gep-merge1.ll
+++ test/Transforms/InstCombine/gep-merge1.ll
@@ -0,0 +1,166 @@
+; PR23580
+; RUN: opt < %s -O2 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.anon = type { [0 x %class.C] }
+%class.C = type { i8 }
+%struct.B = type { i16 }
+%class.G = type <{ %struct.F, [2 x i32], i8, [7 x i8] }>
+%struct.F = type { i8, i8, i8, i16, i32* }
+%struct.A = type { i32 }
+
+@a = global i32 0, align 4
+@b = global i32 0, align 4
+@c = global i32 0, align 4
+@e = global i32 0, align 4
+@d = internal global %struct.anon zeroinitializer, align 1
+
+declare %struct.B* @_ZN1C5m_fn1Ev(%class.C*)
+
+; Check geps inside for.body are merged so loop vectorizer can recognize loads
+; inside for.body to be inter-iterations consecutive, and generate %wide.loads.
+;
+; CHECK-LABEL: @fn2(
+; CHECK: %wide.load{{[0-9]*}} =
+; CHECK: %wide.load{{[0-9]*}} =
+; CHECK: %wide.load{{[0-9]*}} =
+
+define void @fn2(%class.G* nocapture readonly %this, i1 zeroext %arg) align 2 {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end55, %entry
+  %hor_steps = getelementptr inbounds %class.G, %class.G* %this, i32 0, i32 0
+  %tmp1 = load i32, i32* @a, align 4
+  %idxprom = sext i32 %tmp1 to i64
+  %coset_width_in = getelementptr inbounds %class.G, %class.G* %this, i32 0, i32 1
+  %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %coset_width_in, i32 0, i64 %idxprom
+  %tmp2 = load i32, i32* %arrayidx, align 4
+  %use_shorts = getelementptr inbounds %class.G, %class.G* %this, i32 0, i32 2
+  %tmp3 = load i8, i8* %use_shorts, align 1
+  %tobool = trunc i8 %tmp3 to i1
+  br i1 %tobool, label %if.then, label %if.else30
+
+if.then:                                          ; preds = %for.cond
+  %arrayidx3 = getelementptr inbounds [0 x %class.C], [0 x %class.C]* getelementptr inbounds (%struct.anon, %struct.anon* @d, i32 0, i32 0), i32 0, i64 %idxprom
+  %call = call %struct.B* @_ZN1C5m_fn1Ev(%class.C* %arrayidx3)
+  %tmp4 = load i32, i32* @a, align 4
+  %idx.ext = sext i32 %tmp4 to i64
+  %add.ptr = getelementptr inbounds %struct.B, %struct.B* %call, i64 %idx.ext
+  %arrayidx5 = getelementptr inbounds [0 x %class.C], [0 x %class.C]* getelementptr inbounds (%struct.anon, %struct.anon* @d, i32 0, i32 0), i32 0, i64 %idx.ext
+  %call6 = call %struct.B* @_ZN1C5m_fn1Ev(%class.C* %arrayidx5)
+  %downshift = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 1
+  %tmp5 = load i8, i8* %downshift, align 1
+  %conv = zext i8 %tmp5 to i32
+  %rounding_offset = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 3
+  %tmp6 = load i16, i16* %rounding_offset, align 2
+  %conv7 = sext i16 %tmp6 to i32
+  %icoeffs = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 4
+  %tmp7 = load i32*, i32** %icoeffs, align 8
+  %tmp8 = load i32, i32* %tmp7, align 4
+  %cmp = icmp eq i32 %tmp8, 1
+  %cmp10 = icmp eq i32 %tmp8, -1
+  %or.cond = or i1 %cmp, %cmp10
+  br i1 %or.cond, label %if.end29, label %for.cond13
+
+for.cond13:                                       ; preds = %for.body, %if.then
+  %k.0 = phi i32 [ 1, %if.then ], [ %add, %for.body ]
+  %cmp14 = icmp slt i32 %k.0, %tmp2
+  br i1 %cmp14, label %for.body, label %if.end29, !llvm.loop !0
+
+for.body:                                         ; preds = %for.cond13
+  %idxprom15 = sext i32 %k.0 to i64
+  %arrayidx16 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %idxprom15
+  %ival = getelementptr inbounds %struct.B, %struct.B* %arrayidx16, i32 0, i32 0
+  %tmp9 = load i16, i16* %ival, align 2
+  %conv17 = sext i16 %tmp9 to i32
+  %add = add nsw i32 %k.0, 1
+  %idxprom18 = sext i32 %add to i64
+  %arrayidx19 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %idxprom18
+  %ival20 = getelementptr inbounds %struct.B, %struct.B* %arrayidx19, i32 0, i32 0
+  %tmp10 = load i16, i16* %ival20, align 2
+  %conv21 = sext i16 %tmp10 to i32
+  %add22 = add nsw i32 %conv17, %conv21
+  %mul = mul nsw i32 %tmp8, %add22
+  %add23 = add nsw i32 %conv7, %mul
+  %shr = ashr i32 %add23, %conv
+  %arrayidx25 = getelementptr inbounds %struct.B, %struct.B* %call6, i64 %idxprom15
+  %ival26 = getelementptr inbounds %struct.B, %struct.B* %arrayidx25, i32 0, i32 0
+  %tmp11 = load i16, i16* %ival26, align 2
+  %conv27 = sext i16 %tmp11 to i32
+  %sub = sub nsw i32 %conv27, %shr
+  %conv28 = trunc i32 %sub to i16
+  store i16 %conv28, i16* %ival26, align 2
+  br label %for.cond13
+
+if.end29:                                         ; preds = %for.cond13, %if.then
+  br label %if.end55
+
+if.else30:                                        ; preds = %for.cond
+  br label %for.cond31
+
+for.cond31:                                       ; preds = %for.end53, %if.else30
+  %o.0 = phi %struct.A* [ null, %if.else30 ], [ %o.1, %for.end53 ]
+  %extend = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 2
+  %tmp12 = load i8, i8* %extend, align 1
+  %tobool32 = icmp ne i8 %tmp12, 0
+  br i1 %tobool32, label %for.body33, label %for.end54
+
+for.body33:                                       ; preds = %for.cond31
+  %support_length = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 0
+  %tmp13 = load i8, i8* %support_length, align 1
+  %conv34 = zext i8 %tmp13 to i32
+  %icoeffs35 = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 4
+  %tmp14 = load i32*, i32** %icoeffs35, align 8
+  br label %for.cond36
+
+for.cond36:                                       ; preds = %for.inc51, %for.body33
+  %o.1 = phi %struct.A* [ %o.0, %for.body33 ], [ %incdec.ptr, %for.inc51 ]
+  %k.1 = phi i32 [ 0, %for.body33 ], [ %inc52, %for.inc51 ]
+  %cmp37 = icmp slt i32 %k.1, %tmp2
+  br i1 %cmp37, label %for.body38, label %for.end53
+
+for.body38:                                       ; preds = %for.cond36
+  store i32 0, i32* @b, align 4
+  br label %for.cond39
+
+for.cond39:                                       ; preds = %for.body41, %for.body38
+  %tmp15 = load i32, i32* @b, align 4
+  %cmp40 = icmp slt i32 %tmp15, %conv34
+  br i1 %cmp40, label %for.body41, label %for.inc51
+
+for.body41:                                       ; preds = %for.cond39
+  %idxprom42 = sext i32 %tmp15 to i64
+  %arrayidx43 = getelementptr inbounds i32, i32* %tmp14, i64 %idxprom42
+  %tmp16 = load i32, i32* %arrayidx43, align 4
+  %arrayidx45 = getelementptr inbounds %struct.A, %struct.A* %o.1, i64 %idxprom42
+  %ival46 = getelementptr inbounds %struct.A, %struct.A* %arrayidx45, i32 0, i32 0
+  %tmp17 = load i32, i32* %ival46, align 4
+  %mul47 = mul nsw i32 %tmp16, %tmp17
+  store i32 %mul47, i32* @c, align 4
+  %tmp18 = load i32, i32* @b, align 4
+  %inc49 = add nsw i32 %tmp18, 1
+  store i32 %inc49, i32* @b, align 4
+  br label %for.cond39
+
+for.inc51:                                        ; preds = %for.cond39
+  %inc52 = add nsw i32 %k.1, 1
+  %incdec.ptr = getelementptr inbounds %struct.A, %struct.A* %o.1, i32 1
+  br label %for.cond36
+
+for.end53:                                        ; preds = %for.cond36
+  %tmp19 = load i32, i32* @c, align 4
+  store i32 %tmp19, i32* @e, align 4
+  br label %for.cond31
+
+for.end54:                                        ; preds = %for.cond31
+  br label %if.end55
+
+if.end55:                                         ; preds = %for.end54, %if.end29
+  br label %for.cond
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
Index: test/Transforms/InstCombine/getelementptr.ll
===================================================================
--- test/Transforms/InstCombine/getelementptr.ll
+++ test/Transforms/InstCombine/getelementptr.ll
@@ -104,8 +104,8 @@
         %B = getelementptr i32, i32* %A, i64 %D
         ret i32* %B
 ; CHECK-LABEL: @test7(
-; CHECK: %A = getelementptr i32, i32* %I, i64 %C
-; CHECK: %B = getelementptr i32, i32* %A, i64 %D
+; CHECK: %A.sum = add i64 %C, %D
+; CHECK: getelementptr i32, i32* %I, i64 %A.sum
 }
 
 define i8* @test8([10 x i32]* %X) {