Index: llvm/trunk/lib/Analysis/InlineCost.cpp =================================================================== --- llvm/trunk/lib/Analysis/InlineCost.cpp +++ llvm/trunk/lib/Analysis/InlineCost.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -30,6 +31,7 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/InstVisitor.h" @@ -1885,6 +1887,24 @@ if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) return "noduplicate"; + // Loops generally act a lot like calls in that they act like barriers to + // movement, require a certain amount of setup, etc. So when optimising for + // size, we penalise any call sites that perform loops. We do this after all + // other costs here, so we will likely only be dealing with relatively small + // functions (and hence DT and LI will hopefully be cheap). + if (Caller->optForMinSize()) { + DominatorTree DT(F); + LoopInfo LI(DT); + int NumLoops = 0; + for (Loop *L : LI) { + // Ignore loops that will not be executed. + if (DeadBlocks.count(L->getHeader())) + continue; + NumLoops++; + } + Cost += NumLoops * InlineConstants::CallPenalty; + } + // We applied the maximum possible vector bonus at the beginning. Now, // subtract the excess bonus, if any, from the Threshold before // comparing against Cost. 
Index: llvm/trunk/test/Transforms/Inline/ARM/loop-add.ll =================================================================== --- llvm/trunk/test/Transforms/Inline/ARM/loop-add.ll +++ llvm/trunk/test/Transforms/Inline/ARM/loop-add.ll @@ -0,0 +1,95 @@ +; RUN: opt -inline %s -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-arm-none-eabi" + +; CHECK-LABEL: void @doCalls +define void @doCalls(i8* nocapture %p1, i8* nocapture %p2, i32 %n) #0 { +entry: + %div = lshr i32 %n, 1 +; CHECK: call void @LoopCall + tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div) #0 + + %div2 = lshr i32 %n, 2 +; CHECK: call void @LoopCall + tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div2) #0 + +; CHECK-NOT: call void @LoopCall + tail call void @LoopCall(i8* %p2, i8* %p1, i32 0) #0 + +; CHECK-NOT: call void @LoopCall_internal + tail call void @LoopCall_internal(i8* %p1, i8* %p2, i32 %div2) #0 + + %div3 = lshr i32 %n, 4 +; CHECK-NOT: call void @SimpleCall + tail call void @SimpleCall(i8* %p2, i8* %p1, i32 %div3) #0 + ret void +} + +; CHECK-LABEL: define void @LoopCall +define void @LoopCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + %c = icmp ne i32 %num, 0 + br i1 %c, label %while.cond, label %while.end + +while.cond: ; preds = %while.body, %entry + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ] + %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %1 = trunc i32 %num.addr.0 to i8 + %conv1 = add i8 %0, %1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %conv1, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label 
%while.cond + +while.end: ; preds = %while.cond + ret void +} + +; CHECK-NOT: define internal void @LoopCall_internal +define internal void @LoopCall_internal(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + %c = icmp ne i32 %num, 0 + br i1 %c, label %while.cond, label %while.end + +while.cond: ; preds = %while.body, %entry + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ] + %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %1 = trunc i32 %num.addr.0 to i8 + %conv1 = add i8 %0, %1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %conv1, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond + +while.end: ; preds = %while.cond + ret void +} + +; CHECK-LABEL: define void @SimpleCall +define void @SimpleCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + %arrayidx = getelementptr inbounds i8, i8* %source, i32 %num + %0 = load i8, i8* %arrayidx, align 1 + %1 = xor i8 %0, 127 + %arrayidx2 = getelementptr inbounds i8, i8* %dest, i32 %num + store i8 %1, i8* %arrayidx2, align 1 + ret void +} + +attributes #0 = { minsize optsize } + Index: llvm/trunk/test/Transforms/Inline/ARM/loop-memcpy.ll =================================================================== --- llvm/trunk/test/Transforms/Inline/ARM/loop-memcpy.ll +++ llvm/trunk/test/Transforms/Inline/ARM/loop-memcpy.ll @@ -0,0 +1,87 @@ +; RUN: opt -inline %s -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-arm-none-eabi" + +; CHECK-LABEL: define void @matcpy +define void @matcpy(i8* %dest, i8* %source, i32 %num) #0 { 
+entry: + %0 = ptrtoint i8* %dest to i32 + %1 = ptrtoint i8* %source to i32 + %2 = xor i32 %0, %1 + %3 = and i32 %2, 3 + %cmp = icmp eq i32 %3, 0 + br i1 %cmp, label %if.then, label %if.else20 + +if.then: ; preds = %entry + %sub = sub i32 0, %0 + %and2 = and i32 %sub, 3 + %add = or i32 %and2, 4 + %cmp3 = icmp ugt i32 %add, %num + br i1 %cmp3, label %if.else, label %if.then4 + +if.then4: ; preds = %if.then + %sub5 = sub i32 %num, %and2 + %shr = and i32 %sub5, -4 + %sub7 = sub i32 %sub5, %shr + %tobool = icmp eq i32 %and2, 0 + br i1 %tobool, label %if.end, label %if.then8 + +if.then8: ; preds = %if.then4 +; CHECK: call fastcc void @memcpy + call fastcc void @memcpy(i8* %dest, i8* %source, i32 %and2) #0 + %add.ptr = getelementptr inbounds i8, i8* %dest, i32 %and2 + %add.ptr9 = getelementptr inbounds i8, i8* %source, i32 %and2 + br label %if.end + +if.end: ; preds = %if.then4, %if.then8 + %p_dest.0 = phi i8* [ %add.ptr, %if.then8 ], [ %dest, %if.then4 ] + %p_source.0 = phi i8* [ %add.ptr9, %if.then8 ], [ %source, %if.then4 ] + %tobool14 = icmp eq i32 %sub7, 0 + br i1 %tobool14, label %if.end22, label %if.then15 + +if.then15: ; preds = %if.end + %add.ptr13 = getelementptr inbounds i8, i8* %p_source.0, i32 %shr + %add.ptr11 = getelementptr inbounds i8, i8* %p_dest.0, i32 %shr +; CHECK: call fastcc void @memcpy + call fastcc void @memcpy(i8* %add.ptr11, i8* %add.ptr13, i32 %sub7) #0 + br label %if.end22 + +if.else: ; preds = %if.then + call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0 + br label %if.end22 + +if.else20: ; preds = %entry + call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0 + br label %if.end22 + +if.end22: ; preds = %if.then15, %if.end, %if.else, %if.else20 + ret void +} + +; CHECK-LABEL: define internal void @memcpy +define internal void @memcpy(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %num.addr.0 = phi i32 [ %num, %entry ], [ 
%dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr1, %while.body ] + %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %0, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond + +while.end: ; preds = %while.cond + ret void +} + +attributes #0 = { minsize optsize } + Index: llvm/trunk/test/Transforms/Inline/ARM/loop-noinline.ll =================================================================== --- llvm/trunk/test/Transforms/Inline/ARM/loop-noinline.ll +++ llvm/trunk/test/Transforms/Inline/ARM/loop-noinline.ll @@ -0,0 +1,49 @@ +; RUN: opt -inline %s -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-arm-none-eabi" + +; Check we don't inline loops at -Oz. They tend to be larger than we +; expect. 
+ +; CHECK: define i8* @H +@digits = constant [16 x i8] c"0123456789ABCDEF", align 1 +define i8* @H(i8* %p, i32 %val, i32 %num) #0 { +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %p.addr.0 = phi i8* [ %p, %entry ], [ %incdec.ptr, %do.body ] + %val.addr.0 = phi i32 [ %val, %entry ], [ %shl, %do.body ] + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %do.body ] + %shr = lshr i32 %val.addr.0, 28 + %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* @digits, i32 0, i32 %shr + %0 = load i8, i8* %arrayidx, align 1 + %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i32 1 + store i8 %0, i8* %p.addr.0, align 1 + %shl = shl i32 %val.addr.0, 4 + %dec = add i32 %num.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.end, label %do.body + +do.end: ; preds = %do.body + %scevgep = getelementptr i8, i8* %p, i32 %num + ret i8* %scevgep +} + +define nonnull i8* @call1(i8* %p, i32 %val, i32 %num) #0 { +entry: +; CHECK: tail call i8* @H + %call = tail call i8* @H(i8* %p, i32 %val, i32 %num) #0 + ret i8* %call +} + +define nonnull i8* @call2(i8* %p, i32 %val) #0 { +entry: +; CHECK: tail call i8* @H + %call = tail call i8* @H(i8* %p, i32 %val, i32 32) #0 + ret i8* %call +} + +attributes #0 = { minsize optsize } +