Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -97,6 +97,9 @@
   bool HasReturn;
   bool HasIndirectBr;
   bool HasFrameEscape;
+  bool HasVaStart;      // Whether F contains a call to va_start.
+  bool HasMustTailCall; // Whether F contains a musttail call.
+  bool FIsVarArg;       // Whether F is variadic by declaration.
 
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize;
@@ -209,7 +212,8 @@
         Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
-        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
+        HasFrameEscape(false), HasVaStart(false), HasMustTailCall(false),
+        FIsVarArg(Callee.isVarArg()), AllocatedSize(0), NumInstructions(0),
         NumVectorInstructions(0), FiftyPercentVectorBonus(0),
         TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
         NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
@@ -891,6 +895,16 @@
   }
   if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
     ContainsNoDuplicateCall = true;
+  if (CS.isCall()) {
+    auto *CI = cast<CallInst>(CS.getInstruction());
+    if (CI->isMustTailCall()) {
+      HasMustTailCall = true;
+      // If A calls a variadic function B, and B has a musttail call to C, then
+      // C may call va_start and access the arguments A passed to B through
+      // '...'. Therefore we cannot inline B into A.
+      if (FIsVarArg) return false; // No point looking further.
+    }
+  }
 
   if (Function *F = CS.getCalledFunction()) {
     // When we have a concrete function, first try to simplify it directly.
@@ -917,6 +931,10 @@
       case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
+      case Intrinsic::vastart:
+        // The callee calls va_start; inlining cannot handle that.
+        HasVaStart = true;
+        return false;
       }
     }
@@ -1129,7 +1147,8 @@
     // If the visit this instruction detected an uninlinable pattern, abort.
     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
-        HasIndirectBr || HasFrameEscape)
+        HasIndirectBr || HasFrameEscape || HasVaStart ||
+        (HasMustTailCall && FIsVarArg))
       return false;
 
     // If the caller is a recursive function then we don't want to inline
@@ -1509,6 +1528,7 @@
 
 bool llvm::isInlineViable(Function &F) {
   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
+  bool IsVarArg = F.isVarArg();
   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
     // Disallow inlining of functions which contain indirect branches or
     // blockaddresses.
@@ -1520,6 +1540,14 @@
       if (!CS)
         continue;
 
+      if (IsVarArg) {
+        if (const CallInst *CI = dyn_cast<CallInst>(&II)) {
+          // Same logic as in analyzeBlock above.
+          if (CI->isMustTailCall())
+            return false;
+        }
+      }
+
       // Disallow recursive calls.
       if (&F == CS.getCalledFunction())
         return false;
@@ -1530,12 +1558,13 @@
           cast<CallInst>(CS.getInstruction())->canReturnTwice())
         return false;
 
-      // Disallow inlining functions that call @llvm.localescape. Doing this
-      // correctly would require major changes to the inliner.
-      if (CS.getCalledFunction() &&
-          CS.getCalledFunction()->getIntrinsicID() ==
-              llvm::Intrinsic::localescape)
-        return false;
+      // Disallow inlining functions that call @llvm.localescape or @llvm.va_start.
+      // Doing this correctly would require major changes to the inliner.
+      if (const Function *CF = CS.getCalledFunction()) {
+        Intrinsic::ID IID = CF->getIntrinsicID();
+        if (IID == Intrinsic::localescape || IID == Intrinsic::vastart)
+          return false;
+      }
     }
   }
Index: lib/Transforms/Utils/InlineFunction.cpp
===================================================================
--- lib/Transforms/Utils/InlineFunction.cpp
+++ lib/Transforms/Utils/InlineFunction.cpp
@@ -1441,9 +1441,7 @@
 
   const Function *CalledFunc = CS.getCalledFunction();
   if (!CalledFunc ||               // Can't inline external function or indirect
-      CalledFunc->isDeclaration() || // call, or call to a vararg function!
-      CalledFunc->getFunctionType()->isVarArg()) return false;
-
+      CalledFunc->isDeclaration()) return false; // calls.
   // The inliner does not know how to inline through calls with operand bundles
   // in general ...
   if (CS.hasOperandBundles()) {
@@ -1569,7 +1567,11 @@
 
   auto &DL = Caller->getParent()->getDataLayout();
 
-  assert(CalledFunc->arg_size() == CS.arg_size() &&
+  if (CalledFunc->getFunctionType()->isVarArg())
+    assert(CalledFunc->arg_size() <= CS.arg_size() &&
+           "Not enough arguments passed to vararg function");
+  else
+    assert(CalledFunc->arg_size() == CS.arg_size() &&
          "No varargs calls can be inlined!");
 
   // Calculate the vector of arguments to pass into the function cloner, which
@@ -1746,16 +1748,25 @@
     if (CallInst *CI = dyn_cast<CallInst>(TheCall))
       CallSiteTailKind = CI->getTailCallKind();
 
+    bool CalleeIsVararg = CalledFunc->isVarArg(); // Just for the asserts below.
     for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;
          ++BB) {
       for (Instruction &I : *BB) {
        CallInst *CI = dyn_cast<CallInst>(&I);
        if (!CI)
          continue;
+        if (CalleeIsVararg) {
+          assert(!CI->isMustTailCall() &&
+                 "Inlined musttail call in vararg function");
+        }
 
-        if (Function *F = CI->getCalledFunction())
+        if (Function *F = CI->getCalledFunction()) {
          InlinedDeoptimizeCalls |=
              F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
+          if (CalleeIsVararg)
+            assert(F->getIntrinsicID() != Intrinsic::vastart &&
+                   "Inlined va_start");
+        }
 
        // We need to reduce the strength of any inlined tail calls. For
        // musttail, we have to avoid introducing potential unbounded stack
Index: test/Transforms/Inline/inline-varargs.ll
===================================================================
--- test/Transforms/Inline/inline-varargs.ll
+++ test/Transforms/Inline/inline-varargs.ll
@@ -0,0 +1,254 @@
+; RUN: opt -inline -S %s | FileCheck %s
+
+; Check that the side effects of unused arguments still happen.
+; CHECK: %call{{.*}}callThis
+; CHECK: %call{{.*}}callThat
+
+; Verify that calleeY gets inlined. The call should not exist.
+; CHECK-NOT: call{{.*}}calleeY
+
+; Verify that the calleeN* functions do not get inlined. The calls should exist.
+; CHECK: call{{.*}}calleeNMT
+; CHECK: call{{.*}}calleeNMTAI
+; CHECK: call{{.*}}calleeN
+; CHECK: call{{.*}}calleeNAI
+
+; ModuleID = 'inl.cpp'
+;source_filename = "inl.cpp"
+;target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+;target triple = "x86_64-scei-ps4"
+
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+$_Z7calleeYiPKcz = comdat any
+$_Z9calleeNMTiPKcz = comdat any
+$_Z11calleeNMTAIiPKcz = comdat any
+$_Z7calleeNiPKcz = comdat any
+$_Z9calleeNAIiPKcz = comdat any
+
+@.str = private unnamed_addr constant [4 x i8] c"abc\00", align 1
+
+; Function Attrs: nounwind
+define i32 @_Z6calleriPKcz(i32 %p, i8* %x, ...) #0 {
+entry:
+  %p.addr = alloca i32, align 4
+  %x.addr = alloca i8*, align 8
+  %r = alloca i32, align 4
+  %q = alloca i32, align 4
+  store i32 %p, i32* %p.addr, align 4
+  store i8* %x, i8** %x.addr, align 8
+  %call = call i32 @_Z8callThisv()
+  %call1 = call i32 @_Z8callThatv()
+  %call2 = call i32 (i32, i8*, ...) @_Z7calleeYiPKcz(i32 12, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %p.addr, i32 %call, i32 2, i32 %call1)
+  store i32 %call2, i32* %r, align 4
+  %call3 = call i32 (i32, i8*, ...) @_Z9calleeNMTiPKcz(i32 12, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %p.addr, i32 2, i32 3)
+  %0 = load i32, i32* %r, align 4
+  %add = add nsw i32 %0, %call3
+  store i32 %add, i32* %r, align 4
+  %call4 = call i32 (i32, i8*, ...) @_Z11calleeNMTAIiPKcz(i32 12, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %p.addr, i32 2, i32 3)
+  %1 = load i32, i32* %r, align 4
+  %add5 = add nsw i32 %1, %call4
+  store i32 %add5, i32* %r, align 4
+  %2 = load i32, i32* %p.addr, align 4
+  %inc = add nsw i32 %2, 1
+  store i32 %inc, i32* %p.addr, align 4
+  %call6 = call i32 (i32, i8*, ...) @_Z7calleeNiPKcz(i32 142, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %p.addr, i32 2, i32 3)
+  store i32 %call6, i32* %q, align 4
+  %call7 = call i32 (i32, i8*, ...) @_Z9calleeNAIiPKcz(i32 122, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %p.addr, i32 5, i32 3)
+  %3 = load i32, i32* %q, align 4
+  %add8 = add nsw i32 %3, %call7
+  store i32 %add8, i32* %q, align 4
+  %4 = load i32, i32* %r, align 4
+  %5 = load i32, i32* %q, align 4
+  %add9 = add nsw i32 %4, %5
+  %6 = load i8*, i8** %x.addr, align 8
+  %call10 = musttail call i32 (i32, i8*, ...) @_Z5addMTiPKcz(i32 %add9, i8* %6, ...)
+  ret i32 %call10
+}
+
+; Function Attrs: inlinehint nounwind
+define linkonce_odr i32 @_Z7calleeYiPKcz(i32 %a, i8* %m, ...) #1 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %m.addr = alloca i8*, align 8
+  store i32 %a, i32* %a.addr, align 4
+  store i8* %m, i8** %m.addr, align 8
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 1
+  %call = call i32 @_Z4add2i(i32 %add)
+  ret i32 %call
+}
+
+declare i32 @_Z8callThisv() #2
+
+declare i32 @_Z8callThatv() #2
+
+; Function Attrs: inlinehint nounwind
+define linkonce_odr i32 @_Z9calleeNMTiPKcz(i32 %a, i8* %m, ...) #1 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %m.addr = alloca i8*, align 8
+  store i32 %a, i32* %a.addr, align 4
+  store i8* %m, i8** %m.addr, align 8
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 2
+  %1 = load i8*, i8** %m.addr, align 8
+  %call = musttail call i32 (i32, i8*, ...) @_Z5addMTiPKcz(i32 %add, i8* %1, i32 5, ...)
+  ret i32 %call
+}
+
+; Function Attrs: inlinehint nounwind
+define linkonce_odr i32 @_Z11calleeNMTAIiPKcz(i32 %a, i8* %m, ...) #3 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %m.addr = alloca i8*, align 8
+  store i32 %a, i32* %a.addr, align 4
+  store i8* %m, i8** %m.addr, align 8
+  %0 = load i32, i32* %a.addr, align 4
+  %sub = sub nsw i32 %0, 2
+  %1 = load i8*, i8** %m.addr, align 8
+  %call = musttail call i32 (i32, i8*, ...) @_Z5addMTiPKcz(i32 %sub, i8* %1, i32 6, ...)
+  ret i32 %call
+}
+
+; Function Attrs: inlinehint nounwind
+define linkonce_odr i32 @_Z7calleeNiPKcz(i32 %a, i8* %m, ...) #1 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %m.addr = alloca i8*, align 8
+  %ap = alloca [1 x %struct.__va_list_tag], align 16
+  store i32 %a, i32* %a.addr, align 4
+  store i8* %m, i8** %m.addr, align 8
+  %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
+  %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8*
+  call void @llvm.va_start(i8* %arraydecay1)
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 3
+  %arraydecay2 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
+  %gp_offset_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 0
+  %gp_offset = load i32, i32* %gp_offset_p, align 16
+  %fits_in_gp = icmp ule i32 %gp_offset, 40
+  br i1 %fits_in_gp, label %vaarg.in_reg, label %vaarg.in_mem
+
+vaarg.in_reg:                                     ; preds = %entry
+  %1 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 3
+  %reg_save_area = load i8*, i8** %1, align 16
+  %2 = getelementptr i8, i8* %reg_save_area, i32 %gp_offset
+  %3 = bitcast i8* %2 to i32*
+  %4 = add i32 %gp_offset, 8
+  store i32 %4, i32* %gp_offset_p, align 16
+  br label %vaarg.end
+
+vaarg.in_mem:                                     ; preds = %entry
+  %overflow_arg_area_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 2
+  %overflow_arg_area = load i8*, i8** %overflow_arg_area_p, align 8
+  %5 = bitcast i8* %overflow_arg_area to i32*
+  %overflow_arg_area.next = getelementptr i8, i8* %overflow_arg_area, i32 8
+  store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8
+  br label %vaarg.end
+
+vaarg.end:                                        ; preds = %vaarg.in_mem, %vaarg.in_reg
+  %vaarg.addr = phi i32* [ %3, %vaarg.in_reg ], [ %5, %vaarg.in_mem ]
+  %6 = load i32, i32* %vaarg.addr, align 4
+  %add3 = add nsw i32 %add, %6
+  ret i32 %add3
+}
+
+; Function Attrs: alwaysinline inlinehint nounwind
+define linkonce_odr i32 @_Z9calleeNAIiPKcz(i32 %a, i8* %m, ...) #3 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %m.addr = alloca i8*, align 8
+  %ap = alloca [1 x %struct.__va_list_tag], align 16
+  store i32 %a, i32* %a.addr, align 4
+  store i8* %m, i8** %m.addr, align 8
+  %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
+  %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8*
+  call void @llvm.va_start(i8* %arraydecay1)
+  %0 = load i32, i32* %a.addr, align 4
+  %mul = mul nsw i32 %0, 2
+  %arraydecay2 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
+  %gp_offset_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 0
+  %gp_offset = load i32, i32* %gp_offset_p, align 16
+  %fits_in_gp = icmp ule i32 %gp_offset, 40
+  br i1 %fits_in_gp, label %vaarg.in_reg, label %vaarg.in_mem
+
+vaarg.in_reg:                                     ; preds = %entry
+  %1 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 3
+  %reg_save_area = load i8*, i8** %1, align 16
+  %2 = getelementptr i8, i8* %reg_save_area, i32 %gp_offset
+  %3 = bitcast i8* %2 to i32*
+  %4 = add i32 %gp_offset, 8
+  store i32 %4, i32* %gp_offset_p, align 16
+  br label %vaarg.end
+
+vaarg.in_mem:                                     ; preds = %entry
+  %overflow_arg_area_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 2
+  %overflow_arg_area = load i8*, i8** %overflow_arg_area_p, align 8
+  %5 = bitcast i8* %overflow_arg_area to i32*
+  %overflow_arg_area.next = getelementptr i8, i8* %overflow_arg_area, i32 8
+  store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8
+  br label %vaarg.end
+
+vaarg.end:                                        ; preds = %vaarg.in_mem, %vaarg.in_reg
+  %vaarg.addr = phi i32* [ %3, %vaarg.in_reg ], [ %5, %vaarg.in_mem ]
+  %6 = load i32, i32* %vaarg.addr, align 4
+  %add = add nsw i32 %mul, %6
+  ret i32 %add
+}
+
+declare i32 @_Z5addMTiPKcz(i32, i8*, ...) #2
+
+declare i32 @_Z4add2i(i32) #2
+
+; Function Attrs: nounwind
+declare void @llvm.va_start(i8*) #4
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { alwaysinline inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (trunk)"}
+
+;Generated by processing this file with:
+; clang++.exe -cc1 -triple x86_64-scei-ps4 inl.cpp -emit-llvm
+;
+;#include <stdarg.h>
+;int addMT(int p, const char *, ...);
+;int add2(int);
+;
+;inline int calleeY(int a, const char *m, ...) {return add2(a+1);}
+;inline int calleeNMT(int a, const char *m, ...) {return addMT(a+2,m, 5);}
+;inline int // __attribute__((always_inline)) // will add always_inline in .ll
+;  calleeNMTAI(int a, const char *m, ...) {return addMT(a-2,m, 6);}
+;
+;// do not inline this
+;inline int calleeN(int a, const char *m, ...) {
+;  va_list ap;
+;  va_start(ap, m);
+;  return a+3 + va_arg(ap, int);
+;}
+;// even if it is always_inline
+;inline int __attribute__((always_inline)) calleeNAI(int a, const char *m, ...) {
+;  va_list ap;
+;  va_start(ap, m);
+;  return a*2 + va_arg(ap, int);
+;}
+;int callThis(); // calls to this must be preserved
+;int callThat(); // calls to this must be preserved
+;int caller(int p, const char *x, ...)
+;{
+;  int r = calleeY(12, "abc", &p, callThis(), 2, callThat()); // inline this
+;  r += calleeNMT(12, "abc", &p, 2, 3);   // No. calleeNMT has a musttail call.
+;  r += calleeNMTAI(12, "abc", &p, 2, 3); // Not even if it is always_inline.
+;  p++;
+;  int q = calleeN(142, "abc", &p, 2, 3); // No. calleeN has va_start.
+;  q += calleeNAI(122, "abc", &p, 5, 3);  // Not even if it has always_inline.
+;  return addMT(r + q, x);
+;}
+;
+;// and then running the following SED script on it:
+;// SED /define.*calleeNMTAI/s/#1/#3/
+;// SED /call.*addMT/s/= call/= musttail call/
+;// SED /call.*addMT/s/)$/, ...)/
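
For reviewers, here is a minimal C++ sketch of the va_start half of the new restriction; the function names sum2 and use are hypothetical and appear nowhere in the patch or the test. A callee that calls va_start reads its own variadic frame, so splicing its body into a caller would leave va_start with no matching '...' arguments to read; the musttail restriction is the same hazard one level removed, since a musttail call forwards the callee's '...' onward to yet another function.

  #include <stdarg.h>

  // Hypothetical illustration only. sum2's va_start refers to the '...'
  // arguments of sum2's own call frame. If sum2's body were inlined into
  // use(), there would be no variadic frame holding (10, 20) for va_start
  // to find, which is why the patch keeps HasVaStart callees out of the
  // inliner while still allowing variadic callees that never touch '...'.
  static int sum2(int n, ...) {
    va_list ap;
    va_start(ap, n);
    int a = va_arg(ap, int);
    int b = va_arg(ap, int);
    va_end(ap);
    return a + b;
  }

  int use() { return sum2(2, 10, 20); } // must remain a real call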