Index: include/llvm/Analysis/InlineCost.h =================================================================== --- include/llvm/Analysis/InlineCost.h +++ include/llvm/Analysis/InlineCost.h @@ -33,9 +33,9 @@ // Various thresholds used by inline cost analysis. /// Use when optsize (-Os) is specified. const int OptSizeThreshold = 50; - /// Use when minsize (-Oz) is specified. const int OptMinSizeThreshold = 5; +const int OptMinSizeGrowthPenalty = 1; /// Use when -O3 is specified. const int OptAggressiveThreshold = 250; Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -230,6 +230,9 @@ InlineResult analyzeBlock(BasicBlock *BB, SmallPtrSetImpl &EphValues); + /// Cost growth analysis + int getGrowthCost(); + // Disable several entry points to the visitor so we don't accidentally use // them by declaring but not defining them here. void visit(Module *); @@ -279,7 +282,8 @@ PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE), CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold), Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost || - Params.ComputeFullInlineCost || ORE), + Params.ComputeFullInlineCost || ORE || + F.optForMinSize()), IsCallerRecursive(false), IsRecursiveCall(false), ExposesReturnsTwice(false), HasDynamicAlloca(false), ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), @@ -1879,6 +1883,8 @@ bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction(); + + Cost += getGrowthCost(); // If this is a noduplicate call, we can still inline as long as // inlining this would cause the removal of the caller (so the instruction // is not actually duplicated, just moved). @@ -1896,6 +1902,28 @@ return Cost < std::max(1, Threshold); } +int CallAnalyzer::getGrowthCost() { + // If the function of size S is called N times and each inlining cause a + // code growth of X. The expected growth for inlining it N times is + // N * X. + // If the function has local linkage, then the cost would actually be + // (N * X) - S, as the function would be eliminated. + // The expected Growth lowers each time the function is inlined, as N + // reduces as well. + if (!F.optForMinSize()) + return 0; + + int Growth = (NumInstructions - NumInstructionsSimplified) * F.getNumUses(); + + // Test that we can eliminate the function, and give a bonus + // TODO: Perhaps we could add smaller bonus if the function can + // isDiscardableIfUnused() + if (F.hasLocalLinkage()) + Growth -= NumInstructions; + + return Growth * InlineConstants::OptMinSizeGrowthPenalty; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Dump stats about this call's analysis. LLVM_DUMP_METHOD void CallAnalyzer::dump() { Index: test/Transforms/Inline/InlineGrowthCost.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/InlineGrowthCost.ll @@ -0,0 +1,153 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -inline %s -o - | FileCheck %s --implicit-check-not "@DoInline" +define dso_local i8* @ShouldNotInlined(i8* returned %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + br label %while.cond +while.cond: + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ] + %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body +while.body: + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %1 = trunc i32 %num.addr.0 to i8 + %conv1 = sub i8 %1, %0 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %conv1, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond +while.end: + ret i8* %dest +} +define dso_local void @Caller(i8* %p1, i8* nocapture %p2, i32 %n) #0 { +; CHECK-LABEL: @Caller( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = lshr i32 [[N:%.*]], 1 +; CHECK-NEXT: tail call fastcc void @DoNotInline(i8* [[P1:%.*]], i8* [[P2:%.*]], i32 [[DIV]]) #0 +; CHECK-NEXT: [[DIV1:%.*]] = lshr i32 [[N]], 2 +; CHECK-NEXT: tail call fastcc void @DoNotInline(i8* [[P2]], i8* [[P1]], i32 [[DIV1]]) #0 +; CHECK-NEXT: [[DIV3:%.*]] = lshr i32 [[N]], 3 +; CHECK-NEXT: tail call fastcc void @DoNotInline(i8* [[P1]], i8* [[P2]], i32 [[DIV3]]) #0 +; CHECK-NEXT: [[DIV5:%.*]] = lshr i32 [[N]], 4 +; CHECK-NEXT: tail call fastcc void @DoNotInline(i8* [[P2]], i8* [[P1]], i32 [[DIV5]]) #0 +; CHECK-NEXT: [[DIV7:%.*]] = lshr i32 [[N]], 5 +; CHECK-NEXT: tail call fastcc void @DoNotInline(i8* [[P1]], i8* [[P2]], i32 [[DIV7]]) #0 +; CHECK-NEXT: [[DIV9:%.*]] = lshr i32 [[N]], 6 +; CHECK-NEXT: br label [[WHILE_COND_I:%.*]] +; CHECK: while.cond.i: +; CHECK-NEXT: [[NUM_ADDR_0_I:%.*]] = phi i32 [ [[DIV9]], [[ENTRY:%.*]] ], [ [[DEC_I:%.*]], [[WHILE_BODY_I:%.*]] ] +; CHECK-NEXT: [[P_DEST_0_I:%.*]] = phi i8* [ [[P2]], [[ENTRY]] ], [ [[INCDEC_PTR2_I:%.*]], [[WHILE_BODY_I]] ] +; CHECK-NEXT: [[P_SOURCE_0_I:%.*]] = phi i8* [ [[P1]], [[ENTRY]] ], [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ] +; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[NUM_ADDR_0_I]], 0 +; CHECK-NEXT: br i1 [[CMP_I]], label [[DOINLINE_EXIT:%.*]], label [[WHILE_BODY_I]] +; CHECK: while.body.i: +; CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, i8* [[P_SOURCE_0_I]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[P_SOURCE_0_I]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[NUM_ADDR_0_I]] to i8 +; CHECK-NEXT: [[CONV1_I:%.*]] = sub i8 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[INCDEC_PTR2_I]] = getelementptr inbounds i8, i8* [[P_DEST_0_I]], i32 1 +; CHECK-NEXT: store i8 [[CONV1_I]], i8* [[P_DEST_0_I]], align 1 +; CHECK-NEXT: [[DEC_I]] = add i32 [[NUM_ADDR_0_I]], -1 +; CHECK-NEXT: br label [[WHILE_COND_I]] +; CHECK: DoInline.exit: +; CHECK-NEXT: [[DIV11:%.*]] = lshr i32 [[N]], 7 +; CHECK-NEXT: br label [[WHILE_COND_I15:%.*]] +; CHECK: while.cond.i15: +; CHECK-NEXT: [[NUM_ADDR_0_I11:%.*]] = phi i32 [ [[DIV11]], [[DOINLINE_EXIT]] ], [ [[DEC_I19:%.*]], [[WHILE_BODY_I20:%.*]] ] +; CHECK-NEXT: [[P_DEST_0_I12:%.*]] = phi i8* [ [[P2]], [[DOINLINE_EXIT]] ], [ [[INCDEC_PTR2_I18:%.*]], [[WHILE_BODY_I20]] ] +; CHECK-NEXT: [[P_SOURCE_0_I13:%.*]] = phi i8* [ [[P1]], [[DOINLINE_EXIT]] ], [ [[INCDEC_PTR_I16:%.*]], [[WHILE_BODY_I20]] ] +; CHECK-NEXT: [[CMP_I14:%.*]] = icmp eq i32 [[NUM_ADDR_0_I11]], 0 +; CHECK-NEXT: br i1 [[CMP_I14]], label [[DOINLINE_EXIT21:%.*]], label [[WHILE_BODY_I20]] +; CHECK: while.body.i20: +; CHECK-NEXT: [[INCDEC_PTR_I16]] = getelementptr inbounds i8, i8* [[P_SOURCE_0_I13]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[P_SOURCE_0_I13]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[NUM_ADDR_0_I11]] to i8 +; CHECK-NEXT: [[CONV1_I17:%.*]] = sub i8 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[INCDEC_PTR2_I18]] = getelementptr inbounds i8, i8* [[P_DEST_0_I12]], i32 1 +; CHECK-NEXT: store i8 [[CONV1_I17]], i8* [[P_DEST_0_I12]], align 1 +; CHECK-NEXT: [[DEC_I19]] = add i32 [[NUM_ADDR_0_I11]], -1 +; CHECK-NEXT: br label [[WHILE_COND_I15]] +; CHECK: DoInline.exit21: +; CHECK-NEXT: br label [[WHILE_COND_I5:%.*]] +; CHECK: while.cond.i5: +; CHECK-NEXT: [[NUM_ADDR_0_I1:%.*]] = phi i32 [ [[N]], [[DOINLINE_EXIT21]] ], [ [[DEC_I9:%.*]], [[WHILE_BODY_I10:%.*]] ] +; CHECK-NEXT: [[P_DEST_0_I2:%.*]] = phi i8* [ [[P1]], [[DOINLINE_EXIT21]] ], [ [[INCDEC_PTR2_I8:%.*]], [[WHILE_BODY_I10]] ] +; CHECK-NEXT: [[P_SOURCE_0_I3:%.*]] = phi i8* [ [[P2]], [[DOINLINE_EXIT21]] ], [ [[INCDEC_PTR_I6:%.*]], [[WHILE_BODY_I10]] ] +; CHECK-NEXT: [[CMP_I4:%.*]] = icmp eq i32 [[NUM_ADDR_0_I1]], 0 +; CHECK-NEXT: br i1 [[CMP_I4]], label [[SHOULDNOTINLINED_EXIT:%.*]], label [[WHILE_BODY_I10]] +; CHECK: while.body.i10: +; CHECK-NEXT: [[INCDEC_PTR_I6]] = getelementptr inbounds i8, i8* [[P_SOURCE_0_I3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[P_SOURCE_0_I3]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[NUM_ADDR_0_I1]] to i8 +; CHECK-NEXT: [[CONV1_I7:%.*]] = sub i8 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[INCDEC_PTR2_I8]] = getelementptr inbounds i8, i8* [[P_DEST_0_I2]], i32 1 +; CHECK-NEXT: store i8 [[CONV1_I7]], i8* [[P_DEST_0_I2]], align 1 +; CHECK-NEXT: [[DEC_I9]] = add i32 [[NUM_ADDR_0_I1]], -1 +; CHECK-NEXT: br label [[WHILE_COND_I5]] +; CHECK: ShouldNotInlined.exit: +; CHECK-NEXT: ret void +; +entry: + %div = lshr i32 %n, 1 + tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div) #0 + %div1 = lshr i32 %n, 2 + tail call fastcc void @DoNotInline(i8* %p2, i8* %p1, i32 %div1) #0 + %div3 = lshr i32 %n, 3 + tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div3) #0 + %div5 = lshr i32 %n, 4 + tail call fastcc void @DoNotInline(i8* %p2, i8* %p1, i32 %div5) #0 + %div7 = lshr i32 %n, 5 + tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div7) #0 + %div9 = lshr i32 %n, 6 + tail call fastcc void @DoInline(i8* %p2, i8* %p1, i32 %div9) #0 + %div11 = lshr i32 %n, 7 + tail call fastcc void @DoInline(i8* %p2, i8* %p1, i32 %div11) #0 + %call13 = tail call i8* @ShouldNotInlined(i8* %p1, i8* %p2, i32 %n) #0 + ret void +} +define internal fastcc void @DoNotInline(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + br label %while.cond +while.cond: + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ] + %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body +while.body: + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %1 = trunc i32 %num.addr.0 to i8 + %conv1 = add i8 %0, %1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %conv1, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond +while.end: + ret void +} +define internal fastcc void @DoInline(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +; CHECK-NOT-LABEL: @DoInline +entry: + br label %while.cond +while.cond: + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ] + %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body +while.body: + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %1 = trunc i32 %num.addr.0 to i8 + %conv1 = sub i8 %1, %0 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %conv1, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond +while.end: + ret void +} +attributes #0 = { minsize optsize }