Index: include/llvm/Analysis/InlineCost.h =================================================================== --- include/llvm/Analysis/InlineCost.h +++ include/llvm/Analysis/InlineCost.h @@ -44,6 +44,7 @@ const int InstrCost = 5; const int IndirectCallThreshold = 100; const int CallPenalty = 25; +const int CallMinSizePenalty = 15; const int LastCallToStaticBonus = 15000; const int ColdccPenalty = 2000; const int NoreturnPenalty = 10000; @@ -201,7 +202,7 @@ /// Return the cost associated with a callsite, including parameter passing /// and the call/return instruction. -int getCallsiteCost(CallSite CS, const DataLayout &DL); +int getCallsiteCost(CallSite CS, const DataLayout &DL, int CallPenalty); /// Get an InlineCost object representing the cost of inlining this /// callsite. Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -129,6 +129,11 @@ int Cost; bool ComputeFullInlineCost; + /// The penalty for making calls. Defaults to InlineConstants::CallPenalty, but + /// lower at minsize (still not zero due to the inaccuracies in cost + /// modelling). 
+ int CallPenalty; + bool IsCallerRecursive; bool IsRecursiveCall; bool ExposesReturnsTwice; @@ -280,16 +285,16 @@ CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold), Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost || Params.ComputeFullInlineCost || ORE), - IsCallerRecursive(false), IsRecursiveCall(false), - ExposesReturnsTwice(false), HasDynamicAlloca(false), - ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), - HasUninlineableIntrinsic(false), InitsVargArgs(false), AllocatedSize(0), - NumInstructions(0), NumVectorInstructions(0), VectorBonus(0), - SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0), - NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), - NumConstantPtrCmps(0), NumConstantPtrDiffs(0), - NumInstructionsSimplified(0), SROACostSavings(0), - SROACostSavingsLost(0) {} + CallPenalty(InlineConstants::CallPenalty), IsCallerRecursive(false), + IsRecursiveCall(false), ExposesReturnsTwice(false), + HasDynamicAlloca(false), ContainsNoDuplicateCall(false), + HasReturn(false), HasIndirectBr(false), HasUninlineableIntrinsic(false), + InitsVargArgs(false), AllocatedSize(0), NumInstructions(0), + NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0), + EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0), + NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0), + NumConstantPtrDiffs(0), NumInstructionsSimplified(0), + SROACostSavings(0), SROACostSavingsLost(0) {} InlineResult analyzeCall(CallSite CS); @@ -585,6 +590,12 @@ SROAArgValues[&I] = SROAArg; // Constant GEPs are modeled as free. + // Except at minsize, we don't count those used in PHI nodes. 
+ if (I.getParent()->getParent()->optForMinSize()) { + for (auto U : I.users()) + if (isa<PHINode>(U)) + return false; + } return true; } @@ -719,7 +730,7 @@ case Instruction::FPToUI: case Instruction::FPToSI: if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) - Cost += InlineConstants::CallPenalty; + Cost += CallPenalty; default: break; } @@ -897,6 +908,7 @@ // and reduce the threshold if the caller has the necessary attribute. if (Caller->optForMinSize()) { Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); + CallPenalty = InlineConstants::CallMinSizePenalty; // For minsize, we want to disable the single BB bonus and the vector // bonuses, but not the last-call-to-static bonus. Inlining the last call to // a static function will, at the minimum, eliminate the parameter setup and @@ -1087,7 +1099,7 @@ // as such. if (I.getType()->isFloatingPointTy() && TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) - Cost += InlineConstants::CallPenalty; + Cost += CallPenalty; return false; } @@ -1259,7 +1271,7 @@ // Everything other than inline ASM will also have a significant cost // merely from making the call. if (!isa<InlineAsm>(CS.getCalledValue())) - Cost += InlineConstants::CallPenalty; + Cost += CallPenalty; } if (!CS.onlyReadsMemory()) @@ -1735,7 +1747,7 @@ // Give out bonuses for the callsite, as the instructions setting them up // will be gone after inlining. - Cost -= getCallsiteCost(CS, DL); + Cost -= getCallsiteCost(CS, DL, CallPenalty); // If this function uses the coldcc calling convention, prefer not to inline // it. @@ -1825,6 +1837,30 @@ if (BB->hasAddressTaken()) return "blockaddress"; + // Count the number of unconditional branches leading to this block. Add an + // instruction cost for each past the first. This models the cost of + // branches to the block. 
+ if (Caller->optForMinSize()) { + bool FoundUnconditional = false; + for (auto *PredBB : predecessors(BB)) { + // Discount edges we know to not be taken + if (DeadBlocks.count(PredBB)) + continue; + BasicBlock *KnownSuccessor = KnownSuccessors[PredBB]; + if (KnownSuccessor && KnownSuccessor != BB) + continue; + + Instruction *TI = PredBB->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (!BI->isConditional()) { + if (FoundUnconditional) + Cost += InlineConstants::InstrCost; + FoundUnconditional = true; + } + } + } + } + // Analyze the cost of this block. If we blow through the threshold, this // returns false, and we can bail on out. InlineResult IR = analyzeBlock(BB, EphValues); @@ -1925,7 +1961,7 @@ AttributeFuncs::areInlineCompatible(*Caller, *Callee); } -int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) { +int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL, int CallPenalty) { int Cost = 0; for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) { if (CS.isByValArgument(I)) { @@ -1954,7 +1990,7 @@ } } // The call instruction also disappears after inlining. 
- Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty; + Cost += InlineConstants::InstrCost + CallPenalty; return Cost; } Index: lib/Transforms/IPO/PartialInlining.cpp =================================================================== --- lib/Transforms/IPO/PartialInlining.cpp +++ lib/Transforms/IPO/PartialInlining.cpp @@ -798,7 +798,7 @@ const DataLayout &DL = Caller->getParent()->getDataLayout(); // The savings of eliminating the call: - int NonWeightedSavings = getCallsiteCost(CS, DL); + int NonWeightedSavings = getCallsiteCost(CS, DL, InlineConstants::CallPenalty); BlockFrequency NormWeightedSavings(NonWeightedSavings); // Weighted saving is smaller than weighted cost, return false @@ -860,12 +860,12 @@ } if (CallInst *CI = dyn_cast<CallInst>(I)) { - InlineCost += getCallsiteCost(CallSite(CI), DL); + InlineCost += getCallsiteCost(CallSite(CI), DL, InlineConstants::CallPenalty); continue; } if (InvokeInst *II = dyn_cast<InvokeInst>(I)) { - InlineCost += getCallsiteCost(CallSite(II), DL); + InlineCost += getCallsiteCost(CallSite(II), DL, InlineConstants::CallPenalty); continue; } Index: test/Transforms/Inline/AArch64/phi.ll =================================================================== --- test/Transforms/Inline/AArch64/phi.ll +++ test/Transforms/Inline/AArch64/phi.ll @@ -345,7 +345,7 @@ br label %exit exit: - %phi = phi i32* [%gep1, %entry], [%gep3, %if_true] ; Simplifeid to %gep1 + %phi = phi i32* [%gep1, %entry], [%gep3, %if_true] ; Simplified to %gep1 %load = load i32, i32* %phi call void @pad() ret i32 %load Index: test/Transforms/Inline/ARM/loop-add.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/ARM/loop-add.ll @@ -0,0 +1,95 @@ +; RUN: opt -inline %s -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-arm-none-eabi" + +; CHECK-LABEL: void @doCalls +define void @doCalls(i8* nocapture %p1, i8* nocapture %p2, i32 %n) #0 { +entry: + 
%div = lshr i32 %n, 1 +; CHECK: call void @LoopCall + tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div) #0 + + %div2 = lshr i32 %n, 2 +; CHECK: call void @LoopCall + tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div2) #0 + +; CHECK-NOT: call void @LoopCall + tail call void @LoopCall(i8* %p2, i8* %p1, i32 0) #0 + +; CHECK-NOT: call void @LoopCall_internal + tail call void @LoopCall_internal(i8* %p1, i8* %p2, i32 %div2) #0 + + %div3 = lshr i32 %n, 4 +; CHECK-NOT: call void @SimpleCall + tail call void @SimpleCall(i8* %p2, i8* %p1, i32 %div3) #0 + ret void +} + +; CHECK-LABEL: define void @LoopCall +define void @LoopCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + %c = icmp ne i32 %num, 0 + br i1 %c, label %while.cond, label %while.end + +while.cond: ; preds = %while.body, %entry + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ] + %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %1 = trunc i32 %num.addr.0 to i8 + %conv1 = add i8 %0, %1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %conv1, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond + +while.end: ; preds = %while.cond + ret void +} + +; CHECK-LABEL-NOT: define void @LoopCall_internal +define internal void @LoopCall_internal(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + %c = icmp ne i32 %num, 0 + br i1 %c, label %while.cond, label %while.end + +while.cond: ; preds = %while.body, %entry + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ] + %p_source.0 = phi i8* [ %source, 
%entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %1 = trunc i32 %num.addr.0 to i8 + %conv1 = add i8 %0, %1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %conv1, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond + +while.end: ; preds = %while.cond + ret void +} + +; CHECK-LABEL: define void @SimpleCall +define void @SimpleCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + %arrayidx = getelementptr inbounds i8, i8* %source, i32 %num + %0 = load i8, i8* %arrayidx, align 1 + %1 = xor i8 %0, 127 + %arrayidx2 = getelementptr inbounds i8, i8* %dest, i32 %num + store i8 %1, i8* %arrayidx2, align 1 + ret void +} + +attributes #0 = { minsize optsize } + Index: test/Transforms/Inline/ARM/loop-memcpy.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/ARM/loop-memcpy.ll @@ -0,0 +1,87 @@ +; RUN: opt -inline %s -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-arm-none-eabi" + +; CHECK-LABEL: define void @matcpy +define void @matcpy(i8* %dest, i8* %source, i32 %num) #0 { +entry: + %0 = ptrtoint i8* %dest to i32 + %1 = ptrtoint i8* %source to i32 + %2 = xor i32 %0, %1 + %3 = and i32 %2, 3 + %cmp = icmp eq i32 %3, 0 + br i1 %cmp, label %if.then, label %if.else20 + +if.then: ; preds = %entry + %sub = sub i32 0, %0 + %and2 = and i32 %sub, 3 + %add = or i32 %and2, 4 + %cmp3 = icmp ugt i32 %add, %num + br i1 %cmp3, label %if.else, label %if.then4 + +if.then4: ; preds = %if.then + %sub5 = sub i32 %num, %and2 + %shr = and i32 %sub5, -4 + %sub7 = sub i32 %sub5, %shr + %tobool = icmp eq i32 %and2, 0 + br i1 %tobool, label %if.end, label %if.then8 + 
+if.then8: ; preds = %if.then4 +; CHECK: call fastcc void @memcpy + call fastcc void @memcpy(i8* %dest, i8* %source, i32 %and2) #0 + %add.ptr = getelementptr inbounds i8, i8* %dest, i32 %and2 + %add.ptr9 = getelementptr inbounds i8, i8* %source, i32 %and2 + br label %if.end + +if.end: ; preds = %if.then4, %if.then8 + %p_dest.0 = phi i8* [ %add.ptr, %if.then8 ], [ %dest, %if.then4 ] + %p_source.0 = phi i8* [ %add.ptr9, %if.then8 ], [ %source, %if.then4 ] + %tobool14 = icmp eq i32 %sub7, 0 + br i1 %tobool14, label %if.end22, label %if.then15 + +if.then15: ; preds = %if.end + %add.ptr13 = getelementptr inbounds i8, i8* %p_source.0, i32 %shr + %add.ptr11 = getelementptr inbounds i8, i8* %p_dest.0, i32 %shr +; CHECK: call fastcc void @memcpy + call fastcc void @memcpy(i8* %add.ptr11, i8* %add.ptr13, i32 %sub7) #0 + br label %if.end22 + +if.else: ; preds = %if.then + call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0 + br label %if.end22 + +if.else20: ; preds = %entry + call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0 + br label %if.end22 + +if.end22: ; preds = %if.then15, %if.end, %if.else, %if.else20 + ret void +} + +; CHECK-LABEL: define internal void @memcpy +define internal void @memcpy(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr1, %while.body ] + %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %0, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond + +while.end: ; preds = 
%while.cond + ret void +} + +attributes #0 = { minsize optsize } + Index: test/Transforms/Inline/ARM/loop-noinline.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/ARM/loop-noinline.ll @@ -0,0 +1,49 @@ +; RUN: opt -inline %s -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-arm-none-eabi" + +; Check we don't inline loops at -Oz. They tend to be larger than we +; expect. + +; CHECK: define i8* @H +@digits = constant [16 x i8] c"0123456789ABCDEF", align 1 +define i8* @H(i8* %p, i32 %val, i32 %num) #0 { +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %p.addr.0 = phi i8* [ %p, %entry ], [ %incdec.ptr, %do.body ] + %val.addr.0 = phi i32 [ %val, %entry ], [ %shl, %do.body ] + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %do.body ] + %shr = lshr i32 %val.addr.0, 28 + %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* @digits, i32 0, i32 %shr + %0 = load i8, i8* %arrayidx, align 1 + %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i32 1 + store i8 %0, i8* %p.addr.0, align 1 + %shl = shl i32 %val.addr.0, 4 + %dec = add i32 %num.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.end, label %do.body + +do.end: ; preds = %do.body + %scevgep = getelementptr i8, i8* %p, i32 %num + ret i8* %scevgep +} + +define nonnull i8* @call1(i8* %p, i32 %val, i32 %num) #0 { +entry: +; CHECK: tail call i8* @H + %call = tail call i8* @H(i8* %p, i32 %val, i32 %num) #0 + ret i8* %call +} + +define nonnull i8* @call2(i8* %p, i32 %val) #0 { +entry: +; CHECK: tail call i8* @H + %call = tail call i8* @H(i8* %p, i32 %val, i32 32) #0 + ret i8* %call +} + +attributes #0 = { minsize optsize } +