Index: lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- lib/Transforms/InstCombine/InstructionCombining.cpp +++ lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1432,13 +1432,12 @@ if (!shouldMergeGEPs(*cast(&GEP), *Src)) return nullptr; - // Note that if our source is a gep chain itself then we wait for that - // chain to be resolved before we perform this transformation. This - // avoids us creating a TON of code in some cases. - if (GEPOperator *SrcGEP = - dyn_cast(Src->getOperand(0))) - if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) - return nullptr; // Wait until our source is folded to completion. + // Don't combine the malformed cyclic gep instructions like the following: + // %gep2 = getelementptr i8, i8* %gep, i32 1 + // %gep = getelementptr i8, i8* %gep2, i32 1 + GetElementPtrInst *SrcGEP = dyn_cast(Src->getOperand(0)); + if (SrcGEP == &GEP) + return nullptr; SmallVector Indices; @@ -1467,10 +1466,15 @@ // normalized. if (SO1->getType() != GO1->getType()) return nullptr; - // Only do the combine when GO1 and SO1 are both constants. Only in - // this case, we are sure the cost after the merge is never more than - // that before the merge. - if (!isa(GO1) || !isa(SO1)) + // Do gep(gep(...)) combine only when + // 1. GO1 and SO1 are both constants or + // 2. Src has only one use and Src and GEP are in the same BB. + // In these cases we are sure the cost of the combined result will be + // equal to or less than before. 
+ if (((!isa(GO1) || !isa(SO1))) && + (!Src->hasOneUse() || + (isa(Src) && + cast(Src)->getParent() != GEP.getParent()))) return nullptr; Sum = Builder->CreateAdd(SO1, GO1, PtrOp->getName()+".sum"); } Index: test/Transforms/InstCombine/gep-merge1.ll =================================================================== --- test/Transforms/InstCombine/gep-merge1.ll +++ test/Transforms/InstCombine/gep-merge1.ll @@ -0,0 +1,166 @@ +; PR23580 +; RUN: opt < %s -O2 -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.anon = type { [0 x %class.C] } +%class.C = type { i8 } +%struct.B = type { i16 } +%class.G = type <{ %struct.F, [2 x i32], i8, [7 x i8] }> +%struct.F = type { i8, i8, i8, i16, i32* } +%struct.A = type { i32 } + +@a = global i32 0, align 4 +@b = global i32 0, align 4 +@c = global i32 0, align 4 +@e = global i32 0, align 4 +@d = internal global %struct.anon zeroinitializer, align 1 + +declare %struct.B* @_ZN1C5m_fn1Ev(%class.C*) + +; Check geps inside for.body are merged so loop vectorizer can recognize loads +; inside for.body to be inter-iterations consecutive, and generate %wide.loads. 
+; +; CHECK-LABEL: @fn2( +; CHECK: %wide.load{{[0-9]*}} = +; CHECK: %wide.load{{[0-9]*}} = +; CHECK: %wide.load{{[0-9]*}} = + +define void @fn2(%class.G* nocapture readonly %this, i1 zeroext %arg) align 2 { +entry: + br label %for.cond + +for.cond: ; preds = %if.end55, %entry + %hor_steps = getelementptr inbounds %class.G, %class.G* %this, i32 0, i32 0 + %tmp1 = load i32, i32* @a, align 4 + %idxprom = sext i32 %tmp1 to i64 + %coset_width_in = getelementptr inbounds %class.G, %class.G* %this, i32 0, i32 1 + %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %coset_width_in, i32 0, i64 %idxprom + %tmp2 = load i32, i32* %arrayidx, align 4 + %use_shorts = getelementptr inbounds %class.G, %class.G* %this, i32 0, i32 2 + %tmp3 = load i8, i8* %use_shorts, align 1 + %tobool = trunc i8 %tmp3 to i1 + br i1 %tobool, label %if.then, label %if.else30 + +if.then: ; preds = %for.cond + %arrayidx3 = getelementptr inbounds [0 x %class.C], [0 x %class.C]* getelementptr inbounds (%struct.anon, %struct.anon* @d, i32 0, i32 0), i32 0, i64 %idxprom + %call = call %struct.B* @_ZN1C5m_fn1Ev(%class.C* %arrayidx3) + %tmp4 = load i32, i32* @a, align 4 + %idx.ext = sext i32 %tmp4 to i64 + %add.ptr = getelementptr inbounds %struct.B, %struct.B* %call, i64 %idx.ext + %arrayidx5 = getelementptr inbounds [0 x %class.C], [0 x %class.C]* getelementptr inbounds (%struct.anon, %struct.anon* @d, i32 0, i32 0), i32 0, i64 %idx.ext + %call6 = call %struct.B* @_ZN1C5m_fn1Ev(%class.C* %arrayidx5) + %downshift = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 1 + %tmp5 = load i8, i8* %downshift, align 1 + %conv = zext i8 %tmp5 to i32 + %rounding_offset = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 3 + %tmp6 = load i16, i16* %rounding_offset, align 2 + %conv7 = sext i16 %tmp6 to i32 + %icoeffs = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 4 + %tmp7 = load i32*, i32** %icoeffs, align 8 + %tmp8 = load i32, i32* %tmp7, align 4 + %cmp = 
icmp eq i32 %tmp8, 1 + %cmp10 = icmp eq i32 %tmp8, -1 + %or.cond = or i1 %cmp, %cmp10 + br i1 %or.cond, label %if.end29, label %for.cond13 + +for.cond13: ; preds = %for.body, %if.then + %k.0 = phi i32 [ 1, %if.then ], [ %add, %for.body ] + %cmp14 = icmp slt i32 %k.0, %tmp2 + br i1 %cmp14, label %for.body, label %if.end29, !llvm.loop !0 + +for.body: ; preds = %for.cond13 + %idxprom15 = sext i32 %k.0 to i64 + %arrayidx16 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %idxprom15 + %ival = getelementptr inbounds %struct.B, %struct.B* %arrayidx16, i32 0, i32 0 + %tmp9 = load i16, i16* %ival, align 2 + %conv17 = sext i16 %tmp9 to i32 + %add = add nsw i32 %k.0, 1 + %idxprom18 = sext i32 %add to i64 + %arrayidx19 = getelementptr inbounds %struct.B, %struct.B* %add.ptr, i64 %idxprom18 + %ival20 = getelementptr inbounds %struct.B, %struct.B* %arrayidx19, i32 0, i32 0 + %tmp10 = load i16, i16* %ival20, align 2 + %conv21 = sext i16 %tmp10 to i32 + %add22 = add nsw i32 %conv17, %conv21 + %mul = mul nsw i32 %tmp8, %add22 + %add23 = add nsw i32 %conv7, %mul + %shr = ashr i32 %add23, %conv + %arrayidx25 = getelementptr inbounds %struct.B, %struct.B* %call6, i64 %idxprom15 + %ival26 = getelementptr inbounds %struct.B, %struct.B* %arrayidx25, i32 0, i32 0 + %tmp11 = load i16, i16* %ival26, align 2 + %conv27 = sext i16 %tmp11 to i32 + %sub = sub nsw i32 %conv27, %shr + %conv28 = trunc i32 %sub to i16 + store i16 %conv28, i16* %ival26, align 2 + br label %for.cond13 + +if.end29: ; preds = %for.cond13, %if.then + br label %if.end55 + +if.else30: ; preds = %for.cond + br label %for.cond31 + +for.cond31: ; preds = %for.end53, %if.else30 + %o.0 = phi %struct.A* [ null, %if.else30 ], [ %o.1, %for.end53 ] + %extend = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 2 + %tmp12 = load i8, i8* %extend, align 1 + %tobool32 = icmp ne i8 %tmp12, 0 + br i1 %tobool32, label %for.body33, label %for.end54 + +for.body33: ; preds = %for.cond31 + %support_length = 
getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 0 + %tmp13 = load i8, i8* %support_length, align 1 + %conv34 = zext i8 %tmp13 to i32 + %icoeffs35 = getelementptr inbounds %struct.F, %struct.F* %hor_steps, i32 0, i32 4 + %tmp14 = load i32*, i32** %icoeffs35, align 8 + br label %for.cond36 + +for.cond36: ; preds = %for.inc51, %for.body33 + %o.1 = phi %struct.A* [ %o.0, %for.body33 ], [ %incdec.ptr, %for.inc51 ] + %k.1 = phi i32 [ 0, %for.body33 ], [ %inc52, %for.inc51 ] + %cmp37 = icmp slt i32 %k.1, %tmp2 + br i1 %cmp37, label %for.body38, label %for.end53 + +for.body38: ; preds = %for.cond36 + store i32 0, i32* @b, align 4 + br label %for.cond39 + +for.cond39: ; preds = %for.body41, %for.body38 + %tmp15 = load i32, i32* @b, align 4 + %cmp40 = icmp slt i32 %tmp15, %conv34 + br i1 %cmp40, label %for.body41, label %for.inc51 + +for.body41: ; preds = %for.cond39 + %idxprom42 = sext i32 %tmp15 to i64 + %arrayidx43 = getelementptr inbounds i32, i32* %tmp14, i64 %idxprom42 + %tmp16 = load i32, i32* %arrayidx43, align 4 + %arrayidx45 = getelementptr inbounds %struct.A, %struct.A* %o.1, i64 %idxprom42 + %ival46 = getelementptr inbounds %struct.A, %struct.A* %arrayidx45, i32 0, i32 0 + %tmp17 = load i32, i32* %ival46, align 4 + %mul47 = mul nsw i32 %tmp16, %tmp17 + store i32 %mul47, i32* @c, align 4 + %tmp18 = load i32, i32* @b, align 4 + %inc49 = add nsw i32 %tmp18, 1 + store i32 %inc49, i32* @b, align 4 + br label %for.cond39 + +for.inc51: ; preds = %for.cond39 + %inc52 = add nsw i32 %k.1, 1 + %incdec.ptr = getelementptr inbounds %struct.A, %struct.A* %o.1, i32 1 + br label %for.cond36 + +for.end53: ; preds = %for.cond36 + %tmp19 = load i32, i32* @c, align 4 + store i32 %tmp19, i32* @e, align 4 + br label %for.cond31 + +for.end54: ; preds = %for.cond31 + br label %if.end55 + +if.end55: ; preds = %for.end54, %if.end29 + br label %for.cond +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.width", i32 4} Index: 
test/Transforms/InstCombine/getelementptr.ll =================================================================== --- test/Transforms/InstCombine/getelementptr.ll +++ test/Transforms/InstCombine/getelementptr.ll @@ -104,8 +104,8 @@ %B = getelementptr i32, i32* %A, i64 %D ret i32* %B ; CHECK-LABEL: @test7( -; CHECK: %A = getelementptr i32, i32* %I, i64 %C -; CHECK: %B = getelementptr i32, i32* %A, i64 %D +; CHECK: %A.sum = add i64 %C, %D +; CHECK: getelementptr i32, i32* %I, i64 %A.sum } define i8* @test8([10 x i32]* %X) {