diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -40,6 +40,7 @@
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -2270,6 +2271,127 @@
   return true;
 }
 
+/// Returns true if any CallBase user of \p F is a musttail call. Non-CallBase
+/// users are asserted to be BlockAddresses and are skipped.
+static bool hasMustTailCallers(Function *F) {
+  for (User *U : F->users()) {
+    CallBase *CB = dyn_cast<CallBase>(U);
+    if (!CB) {
+      assert(isa<BlockAddress>(U) &&
+             "Expected either CallBase or BlockAddress");
+      continue;
+    }
+    if (CB->isMustTailCall())
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if any user of \p F is an invoke instruction.
+static bool hasInvokeCallers(Function *F) {
+  for (User *U : F->users())
+    if (isa<InvokeInst>(U))
+      return true;
+  return false;
+}
+
+/// Removes the preallocated attribute from \p F and rewrites each CallBase user:
+/// the "preallocated" operand bundle is dropped, every
+/// @llvm.call.preallocated.arg() becomes an alloca bitcast to i8*, the
+/// @llvm.call.preallocated.setup() is erased, and the sequence is wrapped in
+/// @llvm.stacksave()/@llvm.stackrestore(). Asserts that no rewritten call is
+/// musttail.
+static void RemovePreallocated(Function *F) {
+  RemoveAttribute(F, Attribute::Preallocated);
+
+  auto *M = F->getParent();
+
+  IRBuilder<> Builder(M->getContext());
+
+  // Cannot modify users() while iterating over it, so make a copy.
+  SmallVector<User *, 4> PreallocatedCalls(F->users());
+  for (User *U : PreallocatedCalls) {
+    CallBase *CB = dyn_cast<CallBase>(U);
+    if (!CB)
+      continue;
+
+    assert(
+        !CB->isMustTailCall() &&
+        "Shouldn't call RemovePreallocated() on a musttail preallocated call");
+    // Create copy of call without "preallocated" operand bundle.
+    SmallVector<OperandBundleDef, 1> OpBundles;
+    CB->getOperandBundlesAsDefs(OpBundles);
+    CallBase *PreallocatedSetup = nullptr;
+    for (auto *It = OpBundles.begin(); It != OpBundles.end(); ++It) {
+      if (It->getTag() == "preallocated") {
+        PreallocatedSetup = cast<CallBase>(*It->input_begin());
+        OpBundles.erase(It);
+        break;
+      }
+    }
+    assert(PreallocatedSetup && "Did not find preallocated bundle");
+    uint64_t ArgCount =
+        cast<ConstantInt>(PreallocatedSetup->getArgOperand(0))->getZExtValue();
+    CallBase *NewCB = nullptr;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CB)) {
+      NewCB = InvokeInst::Create(II, OpBundles, CB);
+    } else {
+      CallInst *CI = cast<CallInst>(CB);
+      NewCB = CallInst::Create(CI, OpBundles, CB);
+    }
+    CB->replaceAllUsesWith(NewCB);
+    NewCB->takeName(CB);
+    CB->eraseFromParent();
+
+    Builder.SetInsertPoint(PreallocatedSetup);
+    auto *StackSave =
+        Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave));
+
+    Builder.SetInsertPoint(NewCB->getNextNonDebugInstruction());
+    Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackrestore),
+                       StackSave);
+
+    // Replace @llvm.call.preallocated.arg() with alloca.
+    // Cannot modify users() while iterating over it, so make a copy.
+    // @llvm.call.preallocated.arg() can be called with the same index multiple
+    // times. So for each @llvm.call.preallocated.arg(), we see if we have
+    // already created a Value* for the index, and if not, create an alloca and
+    // bitcast right after the @llvm.call.preallocated.setup() so that it
+    // dominates all uses.
+    SmallVector<Value *, 2> ArgAllocas(ArgCount);
+    SmallVector<User *, 2> PreallocatedArgs(PreallocatedSetup->users());
+    for (auto *User : PreallocatedArgs) {
+      auto *UseCall = cast<CallBase>(User);
+      assert(UseCall->getCalledFunction()->getIntrinsicID() ==
+                 Intrinsic::call_preallocated_arg &&
+             "preallocated token use was not a llvm.call.preallocated.arg");
+      uint64_t AllocArgIndex =
+          cast<ConstantInt>(UseCall->getArgOperand(1))->getZExtValue();
+      Value *AllocaReplacement = ArgAllocas[AllocArgIndex];
+      if (!AllocaReplacement) {
+        auto AddressSpace = UseCall->getType()->getPointerAddressSpace();
+        auto *ArgType = UseCall
+                            ->getAttribute(AttributeList::FunctionIndex,
+                                           Attribute::Preallocated)
+                            .getValueAsType();
+        auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction();
+        Builder.SetInsertPoint(InsertBefore);
+        auto *Alloca =
+            Builder.CreateAlloca(ArgType, AddressSpace, nullptr, "paarg");
+        auto *BitCast = Builder.CreateBitCast(
+            Alloca, Type::getInt8PtrTy(M->getContext()), UseCall->getName());
+        ArgAllocas[AllocArgIndex] = BitCast;
+        AllocaReplacement = BitCast;
+      }
+
+      UseCall->replaceAllUsesWith(AllocaReplacement);
+      UseCall->eraseFromParent();
+    }
+    // Remove @llvm.call.preallocated.setup().
+    cast<Instruction>(PreallocatedSetup)->eraseFromParent();
+  }
+}
+
 static bool
 OptimizeFunctions(Module &M,
                   function_ref<TargetLibraryInfo &(Function &)> GetTLI,
@@ -2333,13 +2455,23 @@
     // wouldn't be safe in the presence of inalloca.
     // FIXME: We should also hoist alloca affected by this to the entry
     // block if possible.
-    // FIXME: handle preallocated
     if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
         !F->hasAddressTaken()) {
       RemoveAttribute(F, Attribute::InAlloca);
       Changed = true;
     }
 
+    // FIXME: handle invokes
+    // FIXME: handle musttail
+    if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
+      if (!F->hasAddressTaken() && !hasMustTailCallers(F) &&
+          !hasInvokeCallers(F)) {
+        RemovePreallocated(F);
+        Changed = true;
+      }
+      continue;
+    }
+
     if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
       NumInternalFunc++;
       TargetTransformInfo &TTI = GetTTI(*F);
diff --git a/llvm/test/Transforms/GlobalOpt/fastcc.ll b/llvm/test/Transforms/GlobalOpt/fastcc.ll
--- a/llvm/test/Transforms/GlobalOpt/fastcc.ll
+++ b/llvm/test/Transforms/GlobalOpt/fastcc.ll
@@ -36,8 +36,7 @@
 }
 
 define internal i32 @preallocated(i32* preallocated(i32) %p) {
-; TODO: handle preallocated:
-; CHECK-NOT-LABEL: define internal fastcc i32 @preallocated(i32* %p)
+; CHECK-LABEL: define internal fastcc i32 @preallocated(i32* %p)
   %rv = load i32, i32* %p
   ret i32 %rv
 }
@@ -50,21 +49,21 @@
   call i32 @j(i32* %m)
   %args = alloca inalloca i32
   call i32 @inalloca(i32* inalloca %args)
-  ; TODO: handle preallocated
-  ;%c = call token @llvm.call.preallocated.setup(i32 1)
-  ;%N = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
-  ;%n = bitcast i8* %N to i32*
-  ;call i32 @preallocated(i32* preallocated(i32) %n) ["preallocated"(token %c)]
+  %c = call token @llvm.call.preallocated.setup(i32 1)
+  %N = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  %n = bitcast i8* %N to i32*
+  call i32 @preallocated(i32* preallocated(i32) %n) ["preallocated"(token %c)]
   ret void
 }
-
-@llvm.used = appending global [1 x i8*] [
-   i8* bitcast (i32(i32*)* @j to i8*)
-], section "llvm.metadata"
-
 ; CHECK-LABEL: define void @call_things()
 ; CHECK: call fastcc i32 @f
 ; CHECK: call fastcc i32 @g
 ; CHECK: call coldcc i32 @h
 ; CHECK: call i32 @j
 ; CHECK: call fastcc i32 @inalloca(i32* %args)
+; CHECK-NOT: llvm.call.preallocated
+; CHECK: call fastcc i32 @preallocated(i32* %n)
+
+@llvm.used = appending global [1 x i8*] [
+   i8* bitcast (i32(i32*)* @j to i8*)
+], section "llvm.metadata"
diff --git a/llvm/test/Transforms/GlobalOpt/preallocated.ll b/llvm/test/Transforms/GlobalOpt/preallocated.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/preallocated.ll
@@ -0,0 +1,88 @@
+; RUN: opt < %s -globalopt -S | FileCheck %s
+
+declare token @llvm.call.preallocated.setup(i32)
+declare i8* @llvm.call.preallocated.arg(token, i32)
+declare i32 @__CxxFrameHandler3(...)
+
+; Don't touch functions with any musttail calls
+define internal i32 @preallocated_musttail(i32* preallocated(i32) %p) {
+; CHECK-LABEL: define internal i32 @preallocated_musttail(i32* preallocated(i32) %p)
+  %rv = load i32, i32* %p
+  ret i32 %rv
+}
+
+define i32 @call_preallocated_musttail(i32* preallocated(i32) %a) {
+  %r = musttail call i32 @preallocated_musttail(i32* preallocated(i32) %a)
+  ret i32 %r
+}
+; CHECK-LABEL: define i32 @call_preallocated_musttail(i32* preallocated(i32) %a)
+; CHECK: musttail call i32 @preallocated_musttail(i32* preallocated(i32) %a)
+
+define i32 @call_preallocated_musttail_without_musttail() {
+  %c = call token @llvm.call.preallocated.setup(i32 1)
+  %N = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  %n = bitcast i8* %N to i32*
+  %r = call i32 @preallocated_musttail(i32* preallocated(i32) %n) ["preallocated"(token %c)]
+  ret i32 %r
+}
+; CHECK-LABEL: define i32 @call_preallocated_musttail_without_musttail()
+; CHECK: call i32 @preallocated_musttail(i32* preallocated(i32) %n)
+
+; Check that only one alloca per preallocated arg
+define internal i32 @preallocated(i32* preallocated(i32) %a) {
+; CHECK-LABEL: define internal fastcc i32 @preallocated(i32* %a)
+  %rv = load i32, i32* %a
+  ret i32 %rv
+}
+
+declare void @foo(i8*)
+
+define i32 @call_preallocated_multiple_args() {
+; CHECK-LABEL: define i32 @call_preallocated_multiple_args()
+; CHECK-NEXT: [[SS:%[0-9a-zA-Z_]+]] = call i8* @llvm.stacksave()
+; CHECK-NEXT: [[ARG0:%[0-9a-zA-Z_]+]] = alloca i32
+; CHECK-NEXT: [[ARG1:%[0-9a-zA-Z_]+]] = bitcast i32* [[ARG0]] to i8*
+; CHECK-NEXT: call void @foo(i8* [[ARG1]])
+; CHECK-NEXT: call void @foo(i8* [[ARG1]])
+; CHECK-NEXT: call void @foo(i8* [[ARG1]])
+; CHECK-NEXT: [[ARG2:%[0-9a-zA-Z_]+]] = bitcast i8* [[ARG1]] to i32*
+; CHECK-NEXT: call fastcc i32 @preallocated(i32* [[ARG2]])
+; CHECK-NEXT: call void @llvm.stackrestore(i8* [[SS]])
+; CHECK-NEXT: ret
+  %c = call token @llvm.call.preallocated.setup(i32 1)
+  %a1 = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  call void @foo(i8* %a1)
+  %a2 = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  call void @foo(i8* %a2)
+  %a3 = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  call void @foo(i8* %a3)
+  %b = bitcast i8* %a3 to i32*
+  %r = call i32 @preallocated(i32* preallocated(i32) %b) ["preallocated"(token %c)]
+  ret i32 %r
+}
+
+; Don't touch functions with any invokes
+define internal i32 @preallocated_invoke(i32* preallocated(i32) %p) {
+; CHECK-LABEL: define internal i32 @preallocated_invoke(i32* preallocated(i32) %p)
+  %rv = load i32, i32* %p
+  ret i32 %rv
+}
+
+define i32 @call_preallocated_invoke() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+  %c = call token @llvm.call.preallocated.setup(i32 1)
+  %a = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32)
+  %b = bitcast i8* %a to i32*
+  %r = invoke i32 @preallocated_invoke(i32* preallocated(i32) %b) ["preallocated"(token %c)]
+       to label %conta unwind label %contb
+conta:
+  ret i32 %r
+contb:
+  %s = catchswitch within none [label %catch] unwind to caller
+catch:
+  %p = catchpad within %s []
+  catchret from %p to label %cont
+cont:
+  ret i32 42
+}
+; CHECK-LABEL: define i32 @call_preallocated_invoke()
+; CHECK: invoke i32 @preallocated_invoke(i32* preallocated(i32) %b)