Index: llvm/trunk/include/llvm/Analysis/CGSCCPassManager.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/CGSCCPassManager.h
+++ llvm/trunk/include/llvm/Analysis/CGSCCPassManager.h
@@ -275,6 +275,15 @@
   /// non-null and can be used to continue processing the "top" of the
   /// post-order walk.
   LazyCallGraph::SCC *UpdatedC;
+
+  /// A hacky area where the inliner can retain history about inlining
+  /// decisions that mutated the call graph's SCC structure in order to avoid
+  /// infinite inlining. See the comments in the inliner's CG update logic.
+  ///
+  /// FIXME: Keeping this here seems like a big layering issue, we should look
+  /// for a better technique.
+  SmallDenseSet<std::pair<LazyCallGraph::Node *, LazyCallGraph::SCC *>, 4>
+      &InlinedInternalEdges;
 };
 
 /// \brief The core module pass which does a post-order walk of the SCCs and
@@ -330,8 +339,12 @@
     SmallPtrSet<LazyCallGraph::RefSCC *, 4> InvalidRefSCCSet;
     SmallPtrSet<LazyCallGraph::SCC *, 4> InvalidSCCSet;
 
-    CGSCCUpdateResult UR = {RCWorklist, CWorklist, InvalidRefSCCSet,
-                            InvalidSCCSet, nullptr, nullptr};
+    SmallDenseSet<std::pair<LazyCallGraph::Node *, LazyCallGraph::SCC *>, 4>
+        InlinedInternalEdges;
+
+    CGSCCUpdateResult UR = {RCWorklist,    CWorklist, InvalidRefSCCSet,
+                            InvalidSCCSet, nullptr,   nullptr,
+                            InlinedInternalEdges};
 
     PreservedAnalyses PA = PreservedAnalyses::all();
     CG.buildRefSCCs();
@@ -433,8 +446,12 @@
           // invalid SCC and RefSCCs respectively. But we will short circuit
          // the processing when we check them in the loop above.
        } while (UR.UpdatedC);
-
      } while (!CWorklist.empty());
+
+      // We only need to keep internal inlined edge information within
+      // a RefSCC, clear it to save on space and let the next time we visit
+      // any of these functions have a fresh start.
+      InlinedInternalEdges.clear();
    } while (!RCWorklist.empty());
  }
Index: llvm/trunk/lib/Transforms/IPO/Inliner.cpp
===================================================================
--- llvm/trunk/lib/Transforms/IPO/Inliner.cpp
+++ llvm/trunk/lib/Transforms/IPO/Inliner.cpp
@@ -872,6 +872,19 @@
           InlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory))
         continue;
 
+      // Check if this inlining may repeat breaking an SCC apart that has
+      // already been split once before. In that case, inlining here may
+      // trigger infinite inlining, much as is prevented within the inliner
+      // itself by the InlineHistory above, but spread across CGSCC iterations
+      // and thus hidden from the full inline history.
+      if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
+          UR.InlinedInternalEdges.count({&N, C})) {
+        DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
+                        "previously split out of this SCC by inlining: "
+                     << F.getName() << " -> " << Callee.getName() << "\n");
+        continue;
+      }
+
       // Check whether we want to inline this callsite.
       if (!shouldInline(CS, GetInlineCost, ORE))
         continue;
@@ -949,17 +962,38 @@
         for (LazyCallGraph::Edge &E : *CalleeN)
           RC->insertTrivialRefEdge(N, E.getNode());
       }
-      InlinedCallees.clear();
 
       // At this point, since we have made changes we have at least removed
       // a call instruction. However, in the process we do some incremental
       // simplification of the surrounding code. This simplification can
       // essentially do all of the same things as a function pass and we can
       // re-use the exact same logic for updating the call graph to reflect the
-      // change..
+      // change.
+      LazyCallGraph::SCC *OldC = C;
       C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
       DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
       RC = &C->getOuterRefSCC();
+
+      // If this causes an SCC to split apart into multiple smaller SCCs, there
+      // is a subtle risk we need to prepare for. Other transformations may
+      // expose an "infinite inlining" opportunity later, and because of the SCC
+      // mutation, we will revisit this function and potentially re-inline. If we
+      // do, and that re-inlining also has the potential to mutate the SCC
+      // structure, the infinite inlining problem can manifest through infinite
+      // SCC splits and merges. To avoid this, we capture the originating caller
+      // node and the SCC containing the call edge. This is a slight over
+      // approximation of the possible inlining decisions that must be avoided,
+      // but is relatively efficient to store.
+      // FIXME: This seems like a very heavyweight way of retaining the inline
+      // history, we should look for a more efficient way of tracking it.
+      if (C != OldC && llvm::any_of(InlinedCallees, [&](Function *Callee) {
+            return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
+          })) {
+        DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
+                        "retaining this to avoid infinite inlining.\n");
+        UR.InlinedInternalEdges.insert({&N, OldC});
+      }
+      InlinedCallees.clear();
     }
 
     // Now that we've finished inlining all of the calls across this SCC, delete
Index: llvm/trunk/test/Transforms/Inline/cgscc-cycle.ll
===================================================================
--- llvm/trunk/test/Transforms/Inline/cgscc-cycle.ll
+++ llvm/trunk/test/Transforms/Inline/cgscc-cycle.ll
@@ -0,0 +1,125 @@
+; This test contains extremely tricky call graph structures for the inliner to
+; handle correctly. They form cycles where the inliner introduces code that is
+; immediately or can eventually be transformed back into the original code. And
+; each step changes the call graph and so will trigger iteration. This requires
+; some out-of-band way to prevent infinitely re-inlining and re-transforming the
+; code.
+;
+; RUN: opt < %s -passes='cgscc(inline,function(sroa,instcombine))' -S | FileCheck %s
+
+
+; The `test1_*` collection of functions form a directly cycling pattern.
+
+define void @test1_a(i8** %ptr) {
+; CHECK-LABEL: define void @test1_a(
+entry:
+  call void @test1_b(i8* bitcast (void (i8*, i1, i32)* @test1_b to i8*), i1 false, i32 0)
+; Inlining and simplifying this call will reliably produce the exact same call,
+; over and over again. However, each inlining increments the count, and so we
+; expect this test case to stop after one round of inlining with a final
+; argument of '1'.
+; CHECK-NOT: call
+; CHECK: call void @test1_b(i8* bitcast (void (i8*, i1, i32)* @test1_b to i8*), i1 false, i32 1)
+; CHECK-NOT: call
+
+  ret void
+}
+
+define void @test1_b(i8* %arg, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test1_b(
+entry:
+  %a = alloca i8*
+  store i8* %arg, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg, i8** %[[A]]
+
+  br i1 %flag, label %bb1, label %bb2
+
+bb1:
+  call void @test1_a(i8** %a) noinline
+  br label %bb2
+
+bb2:
+  %cast = bitcast i8** %a to void (i8*, i1, i32)**
+  %p = load void (i8*, i1, i32)*, void (i8*, i1, i32)** %cast
+  %inline_count_inc = add i32 %inline_count, 1
+  call void %p(i8* %arg, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i1, i32)*, void (i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+  ret void
+}
+
+define void @test2_a(i8** %ptr) {
+; CHECK-LABEL: define void @test2_a(
+entry:
+  call void @test2_b(i8* bitcast (void (i8*, i8*, i1, i32)* @test2_b to i8*), i8* bitcast (void (i8*, i8*, i1, i32)* @test2_c to i8*), i1 false, i32 0)
+; Inlining and simplifying this call will reliably produce the exact same call,
+; but only after doing two rounds of inlining, first from @test2_b then
+; @test2_c. We check the exact number of inlining rounds before we cut off to
+; break the cycle by inspecting the last parameter that gets incremented with
+; each inlined function body.
+; CHECK-NOT: call
+; CHECK: call void @test2_b(i8* bitcast (void (i8*, i8*, i1, i32)* @test2_b to i8*), i8* bitcast (void (i8*, i8*, i1, i32)* @test2_c to i8*), i1 false, i32 2)
+; CHECK-NOT: call
+  ret void
+}
+
+define void @test2_b(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test2_b(
+entry:
+  %a = alloca i8*
+  store i8* %arg2, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg2, i8** %[[A]]
+
+  br i1 %flag, label %bb1, label %bb2
+
+bb1:
+  call void @test2_a(i8** %a) noinline
+  br label %bb2
+
+bb2:
+  %p = load i8*, i8** %a
+  %cast = bitcast i8* %p to void (i8*, i8*, i1, i32)*
+  %inline_count_inc = add i32 %inline_count, 1
+  call void %cast(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i8*, i1, i32)*, void (i8*, i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+  ret void
+}
+
+define void @test2_c(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count) {
+; CHECK-LABEL: define void @test2_c(
+entry:
+  %a = alloca i8*
+  store i8* %arg1, i8** %a
+; This alloca and store should remain through any optimization.
+; CHECK: %[[A:.*]] = alloca
+; CHECK: store i8* %arg1, i8** %[[A]]
+
+  br i1 %flag, label %bb1, label %bb2
+
+bb1:
+  call void @test2_a(i8** %a) noinline
+  br label %bb2
+
+bb2:
+  %p = load i8*, i8** %a
+  %cast = bitcast i8* %p to void (i8*, i8*, i1, i32)*
+  %inline_count_inc = add i32 %inline_count, 1
+  call void %cast(i8* %arg1, i8* %arg2, i1 %flag, i32 %inline_count_inc)
+; And we should continue to load and call indirectly through optimization.
+; CHECK: %[[CAST:.*]] = bitcast i8** %[[A]] to void (i8*, i8*, i1, i32)**
+; CHECK: %[[P:.*]] = load void (i8*, i8*, i1, i32)*, void (i8*, i8*, i1, i32)** %[[CAST]]
+; CHECK: call void %[[P]](
+
+  ret void
+}
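
Note (not part of the patch): the standalone sketch below only illustrates the bookkeeping the diff adds, namely recording a (caller node, old SCC) pair after an inline that splits an SCC, and consulting that record before inlining an edge that is still internal to the caller's SCC. The Node and SCC stand-in types, the helper names (shouldSkipInline, recordIfSplit, contains), and the use of std::vector in place of LLVM's SmallDenseSet are simplifications for illustration, not LLVM API.

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

struct Node {};
struct SCC {};

// Pairs of (caller node, SCC that contained the internal call edge) which
// already triggered an SCC split; re-inlining them risks an infinite loop.
using InlinedEdgeHistory = std::vector<std::pair<Node *, SCC *>>;

static bool contains(const InlinedEdgeHistory &H, Node *Caller, SCC *C) {
  return std::find(H.begin(), H.end(), std::make_pair(Caller, C)) != H.end();
}

// Pre-inline check: skip an edge that is still internal to the caller's SCC
// and has already been recorded as having split this SCC before.
static bool shouldSkipInline(const InlinedEdgeHistory &H, Node *Caller,
                             SCC *CallerSCC, SCC *CalleeSCC) {
  return CalleeSCC == CallerSCC && contains(H, Caller, CallerSCC);
}

// Post-inline update: if the call graph update moved the caller into a new
// SCC while some inlined callee stayed behind in the old one, remember the
// (caller, old SCC) pair.
static void recordIfSplit(InlinedEdgeHistory &H, Node *Caller, SCC *OldSCC,
                          SCC *NewSCC, bool CalleeStillInOldSCC) {
  if (NewSCC != OldSCC && CalleeStillInOldSCC)
    H.push_back({Caller, OldSCC});
}

int main() {
  InlinedEdgeHistory History;
  Node Caller;
  SCC Old, New;

  // First visit: nothing recorded yet, so the internal edge may be inlined.
  std::printf("skip? %d\n", shouldSkipInline(History, &Caller, &Old, &Old));

  // That inlining split the SCC; record (caller, old SCC).
  recordIfSplit(History, &Caller, &Old, &New, /*CalleeStillInOldSCC=*/true);

  // On a later revisit, the same internal edge is now skipped.
  std::printf("skip? %d\n", shouldSkipInline(History, &Caller, &Old, &Old));
  return 0;
}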