Index: include/llvm/Analysis/InlineCost.h
===================================================================
--- include/llvm/Analysis/InlineCost.h
+++ include/llvm/Analysis/InlineCost.h
@@ -33,9 +33,9 @@
 // Various thresholds used by inline cost analysis.
 /// Use when optsize (-Os) is specified.
 const int OptSizeThreshold = 50;
-
 /// Use when minsize (-Oz) is specified.
 const int OptMinSizeThreshold = 5;
+const int OptMinSizeGrowthPenalty = 1;
 
 /// Use when -O3 is specified.
 const int OptAggressiveThreshold = 250;
Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -230,6 +230,9 @@
   InlineResult analyzeBlock(BasicBlock *BB,
                             SmallPtrSetImpl<const Value *> &EphValues);
 
+  /// Estimate the code-size growth of inlining every call to this callee.
+  int getGrowthCost();
+
   // Disable several entry points to the visitor so we don't accidentally use
   // them by declaring but not defining them here.
   void visit(Module *);
@@ -279,7 +282,8 @@
         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
         Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
-                                       Params.ComputeFullInlineCost || ORE),
+                                       Params.ComputeFullInlineCost || ORE ||
+                                       F.optForMinSize()),
         IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
@@ -1879,6 +1883,8 @@
 
   bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
                                     &F == CS.getCalledFunction();
+
+  Cost += getGrowthCost();
   // If this is a noduplicate call, we can still inline as long as
   // inlining this would cause the removal of the caller (so the instruction
   // is not actually duplicated, just moved).
@@ -1896,6 +1902,25 @@
   return Cost < std::max(1, Threshold);
 }
 
+int CallAnalyzer::getGrowthCost() {
+  // If a function of size S is called N times and each inlining causes a
+  // code growth of X, the net code-size growth from inlining every call
+  // site is (N * X) - S, assuming the per-site growth is constant and the
+  // original function body can be removed afterwards.
+  if (!F.optForMinSize())
+    return 0;
+
+  int Growth = (NumInstructions - NumInstructionsSimplified) * F.getNumUses();
+
+  // If the whole function body can be eliminated after inlining, give a
+  // bonus for the instructions that disappear.
+  // TODO: Perhaps add a smaller bonus if it is only isDiscardableIfUnused().
+  if (F.hasLocalLinkage())
+    Growth -= NumInstructions;
+
+  return Growth * InlineConstants::OptMinSizeGrowthPenalty;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 /// Dump stats about this call's analysis.
 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
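To make the arithmetic in getGrowthCost() easy to check, the sketch below models it as a standalone C++ program. It is only an illustration under assumed values: the CalleeModel struct, its field names, and the numbers in main() are invented for the example, and the optForMinSize() early return is left out so that the formula stands alone.

// Minimal standalone model of the growth heuristic; the struct, its fields,
// and the example numbers are illustrative assumptions, not patch code.
#include <iostream>

namespace {

constexpr int OptMinSizeGrowthPenalty = 1; // mirrors the new constant

struct CalleeModel {
  int NumInstructions;           // S: size of the callee body
  int NumInstructionsSimplified; // instructions expected to fold away per site
  int NumUses;                   // N: number of call sites
  bool HasLocalLinkage;          // body can be dropped once all calls are gone
};

// Net growth = N * X - S, where X is the per-call-site growth that survives
// simplification and S is subtracted only when the body can be removed.
int getGrowthCost(const CalleeModel &F) {
  int PerSiteGrowth = F.NumInstructions - F.NumInstructionsSimplified; // X
  int Growth = PerSiteGrowth * F.NumUses;                              // N * X
  if (F.HasLocalLinkage)
    Growth -= F.NumInstructions;                                       // - S
  return Growth * OptMinSizeGrowthPenalty;
}

} // namespace

int main() {
  // A 20-instruction internal callee where 5 instructions simplify per site.
  CalleeModel ManyUses{20, 5, 7, true}; // 15 * 7 - 20 = 85 -> large penalty
  CalleeModel OneUse{20, 5, 1, true};   // 15 * 1 - 20 = -5 -> small bonus
  std::cout << getGrowthCost(ManyUses) << '\n'; // prints 85
  std::cout << getGrowthCost(OneUse) << '\n';   // prints -5
}

Because analyzeCall adds the same estimate at every call site, a callee that keeps many uses is penalized repeatedly, while a single-use local callee can even receive a small bonus.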
Index: test/Transforms/Inline/InlineGrowthCost.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/InlineGrowthCost.ll
@@ -0,0 +1,85 @@
+; RUN: opt -S -inline %s -o - | grep "tail call fastcc void @DoNotInline"
+; RUN: opt -S -inline %s -o - | not grep "tail call fastcc void @DoInline"
+define dso_local i8* @ShouldNotInlined(i8* returned %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  br label %while.cond
+while.cond:
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+while.body:
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = sub i8 %1, %0
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+while.end:
+  ret i8* %dest
+}
+define dso_local void @Caller(i8* %p1, i8* nocapture %p2, i32 %n) #0 {
+entry:
+  %div = lshr i32 %n, 1
+  tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div) #0
+  %div1 = lshr i32 %n, 2
+  tail call fastcc void @DoNotInline(i8* %p2, i8* %p1, i32 %div1) #0
+  %div3 = lshr i32 %n, 3
+  tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div3) #0
+  %div5 = lshr i32 %n, 4
+  tail call fastcc void @DoNotInline(i8* %p2, i8* %p1, i32 %div5) #0
+  %div7 = lshr i32 %n, 5
+  tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div7) #0
+  %div9 = lshr i32 %n, 6
+  tail call fastcc void @DoInline(i8* %p2, i8* %p1, i32 %div9) #0
+  %div11 = lshr i32 %n, 7
+  tail call fastcc void @DoInline(i8* %p2, i8* %p1, i32 %div11) #0
+  %call13 = tail call i8* @ShouldNotInlined(i8* %p1, i8* %p2, i32 %n) #0
+  ret void
+}
+define internal fastcc void @DoNotInline(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  br label %while.cond
+while.cond:
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+while.body:
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = add i8 %0, %1
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+while.end:
+  ret void
+}
+define internal fastcc void @DoInline(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  br label %while.cond
+while.cond:
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+while.body:
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = sub i8 %1, %0
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+while.end:
+  ret void
+}
+attributes #0 = { minsize optsize }
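Roughly, the test is built around the growth estimate itself: @DoNotInline and @DoInline share essentially the same loop body, so S and the per-site growth X are about equal for both, and the only relevant difference is the number of call sites in @Caller, five versus two. The five-use callee is charged about 5X - S at every call site while the two-use callee is charged only about 2X - S, which is what should keep @DoNotInline out of line at minsize while @DoInline still gets inlined; the grep and not grep RUN lines check exactly that split.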