Index: lib/Transforms/IPO/Inliner.cpp
===================================================================
--- lib/Transforms/IPO/Inliner.cpp
+++ lib/Transforms/IPO/Inliner.cpp
@@ -517,6 +517,20 @@
       Function *Caller = CS.getCaller();
       Function *Callee = CS.getCalledFunction();
 
+      // Track whether this is the last use of the callee; this is used
+      // later to delete the now-unused function body.
+      // TODO: Use this information to splice the callee in instead of cloning.
+      bool LastCalleeUse = Callee && Callee->hasOneUse() &&
+                           Callee->hasLocalLinkage() &&
+                           // TODO: Can remove if in SCC now.
+                           !SCCFunctions.count(Callee) &&
+
+                           // The function may be apparently dead, but if
+                           // there are indirect callgraph references to the
+                           // node, we cannot delete it yet; doing so could
+                           // invalidate the CGSCC iterator.
+                           CG[Callee]->getNumReferences() == 1;
+
       // If this call site is dead and it is to a readonly function, we should
       // just delete the call instead of trying to inline it, regardless of
       // size.  This happens because IPSCCP propagates the result out of the
@@ -531,7 +545,7 @@
       } else {
         // We can only inline direct calls to non-declarations.
         if (!Callee || Callee->isDeclaration()) continue;
-      
+
         // If this call site was obtained by inlining another function, verify
         // that the include path for the function did not include the callee
         // itself.  If so, we'd be recursively inlining the same function,
@@ -588,19 +602,15 @@
           }
         }
       }
-      
+
       // If we inlined or deleted the last possible call site to the function,
       // delete the function body now.
-      if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
-          // TODO: Can remove if in SCC now.
-          !SCCFunctions.count(Callee) &&
-
-          // The function may be apparently dead, but if there are indirect
-          // callgraph references to the node, we cannot delete it yet, this
-          // could invalidate the CGSCC iterator.
-          CG[Callee]->getNumReferences() == 0) {
+      if (LastCalleeUse) {
        DEBUG(dbgs() << "    -> Deleting dead function: "
              << Callee->getName() << "\n");
+
+        assert(Callee->use_empty() && CG[Callee]->getNumReferences() == 0 &&
+               "Failed to update call graph after devirtualizing");
        CallGraphNode *CalleeNode = CG[Callee];
 
        // Remove any call graph edges from the callee to its callees.
Index: lib/Transforms/Utils/InlineFunction.cpp
===================================================================
--- lib/Transforms/Utils/InlineFunction.cpp
+++ lib/Transforms/Utils/InlineFunction.cpp
@@ -506,6 +506,98 @@
   return nullptr;
 }
 
+/// \brief Given the value returned by an inlined function and the inlined
+/// call site, try to devirtualize any calls made through that value when it
+/// points to a Function.  Updates the call graph for devirtualized calls.
+static bool devirtualizedCallUsers(Value *ReturnValue,
+                                   Instruction *TheCall,
+                                   InlineFunctionInfo &IFI) {
+  if (!IFI.CG)
+    return false;
+  // Check whether the returned value points to a Function*, looking through
+  // bitcasts to do so.  Without a Function*, there's nothing to devirtualize.
+  Function *CalledF = nullptr;
+  if (Operator::getOpcode(ReturnValue) == Instruction::BitCast)
+    CalledF = dyn_cast<Function>(cast<Operator>(ReturnValue)->getOperand(0));
+  else
+    CalledF = dyn_cast<Function>(ReturnValue);
+  if (!CalledF)
+    return false;
+
+  // When we find an eligible call we update its edge with replaceCallEdge,
+  // which is linear in the number of call edges.  Rather than doing RAUW and
+  // then re-walking the users of the return value, first collect all the
+  // eligible call sites so replaceCallEdge is only called when needed.
+  SmallVector<BitCastInst *, 4> Bitcasts;
+  SmallPtrSet<Instruction *, 4> Calls;
+
+  // Note: calling 'Use.set' (as RAUW does) would invalidate these iterators,
+  // so only record the interesting users here and replace the uses below.
+  for (Use &TheUse : TheCall->uses()) {
+    User *U = TheUse.getUser();
+    // Note down bitcasts for later.  We'll recursively walk them looking for
+    // more calls.
+    if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) {
+      Bitcasts.push_back(BI);
+      continue;
+    }
+    // If we find a call where this use is the called value (not an argument
+    // to the call), note it down as one to devirtualize.
+    CallSite CS(U);
+    if (CS && CS.getCalledValue() == TheCall) {
+      Calls.insert(CS.getInstruction());
+      continue;
+    }
+  }
+
+  TheCall->replaceAllUsesWith(ReturnValue);
+
+  // Now recursively walk the bitcast users looking for more calls.
+  while (!Bitcasts.empty()) {
+    BitCastInst *BI = Bitcasts.pop_back_val();
+    for (User *U : BI->users()) {
+      if (BitCastInst *UserBI = dyn_cast<BitCastInst>(U)) {
+        Bitcasts.push_back(UserBI);
+        continue;
+      }
+      // If we find a call to this bitcast, note it down as one we should
+      // try to devirtualize.
+      CallSite CS(U);
+      if (CS && CS.getCalledValue() == BI)
+        Calls.insert(CS.getInstruction());
+    }
+  }
+
+  // Now walk all the calls we found, and update the call graph for each one
+  // we manage to devirtualize.
+  if (Calls.empty())
+    return true;
+
+  const Function *Caller = TheCall->getParent()->getParent();
+  CallGraphNode *CallerNode = IFI.CG->getOrInsertFunction(Caller);
+  CallGraphNode *CalleeNode = IFI.CG->getOrInsertFunction(CalledF);
+
+  for (Instruction *CallI : Calls) {
+    CallSite CS(CallI);
+    assert(CS && "Expected Call/Invoke instruction");
+    Value *CalledValue = CS.getCalledValue();
+
+    // If we're calling through a bitcast, try to simplify it first.
+    if (CalledValue != CalledF)
+      if (Instruction *CalledInstruction = dyn_cast<Instruction>(CalledValue))
+        if (Value *V = SimplifyInstruction(CalledInstruction, IFI.DL)) {
+          CS.setCalledFunction(V);
+          // Delete the original instruction if it has no more users.
+          if (CalledValue->use_empty())
+            CalledInstruction->eraseFromParent();
+        }
+
+    if (CS.getCalledFunction() == CalledF)
+      CallerNode->replaceCallEdge(CS, CS, CalleeNode);
+  }
+  return true;
+}
+
 /// InlineFunction - This function inlines the called function into the basic
 /// block of the caller.  This returns false if it is not possible to inline
 /// this call.  The program is still in a well defined state if this occurs
@@ -623,7 +715,7 @@
     // have no dead or constant instructions leftover after inlining occurs
     // (which can happen, e.g., because an argument was constant), but we'll be
     // happy with whatever the cloner can do.
-    CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, 
+    CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
                               /*ModuleLevelChanges=*/false, Returns, ".i",
                               &InlinedFunctionInfo, IFI.DL, TheCall);
 
@@ -852,7 +944,8 @@
       ReturnInst *R = Returns[0];
       if (TheCall == R->getReturnValue())
         TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
-      else
+      else if (!devirtualizedCallUsers(R->getReturnValue(), TheCall,
+                                       IFI))
         TheCall->replaceAllUsesWith(R->getReturnValue());
     }
     // Since we are now done with the Call/Invoke, we can delete it.
@@ -956,7 +1049,8 @@
   if (!TheCall->use_empty()) {
     if (TheCall == Returns[0]->getReturnValue())
       TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
-    else
+    else if (!devirtualizedCallUsers(Returns[0]->getReturnValue(),
+                                     TheCall, IFI))
       TheCall->replaceAllUsesWith(Returns[0]->getReturnValue());
   }
 
Index: test/Transforms/Inline/devirtualize-2.ll
===================================================================
--- test/Transforms/Inline/devirtualize-2.ll
+++ test/Transforms/Inline/devirtualize-2.ll
@@ -42,3 +42,113 @@
 
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT: ret i32 41
+
+; This is test1 but with multiple basic blocks in the callee, to check that
+; we have coverage over all parts of the inliner which devirtualize based on
+; return values from inlined calls.
+define i32 @test3(i1 %cond) {
+  %funcall1_ = call fastcc i32 ()* (i1)* @f4a(i1 %cond)
+  %executecommandptr1_ = call i32 %funcall1_()
+  ret i32 %executecommandptr1_
+}
+
+define internal fastcc i32 ()* @f4a(i1 %cond) nounwind readnone {
+  br i1 %cond, label %l1, label %l2
+l2:
+  br label %l1
+l1:
+  ret i32 ()* @f5a
+}
+
+define internal i32 @f5a() nounwind readnone {
+  ret i32 1
+}
+
+; CHECK: @test3(i1 %cond)
+; CHECK: f4a.exit:
+; CHECK-NEXT: ret i32 1
+
+; This is test1 but with an invoke instead of a call.
+define i32 @test4(i1 %cond) {
+  %funcall1_ = invoke fastcc i32 ()* ()* @f1() to label %normal_bb unwind label %unwind_bb
+normal_bb:
+  %executecommandptr1_ = call i32 %funcall1_()
+  ret i32 %executecommandptr1_
+unwind_bb:
+  landingpad { i8*, i32 } personality i32 (...)* null
+    catch i8** null
+  unreachable
+}
+
+; CHECK: @test4(i1 %cond)
+; CHECK: normal_bb:
+; CHECK-NEXT: ret i32 1
+
+; This is test3 but with an invoke instead of a call.
+define i32 @test5(i1 %cond) {
+  %funcall1_ = invoke fastcc i32 ()* (i1)* @f4a(i1 %cond) to label %normal_bb unwind label %unwind_bb
+normal_bb:
+  %executecommandptr1_ = call i32 %funcall1_()
+  ret i32 %executecommandptr1_
+unwind_bb:
+  landingpad { i8*, i32 } personality i32 (...)* null
+    catch i8** null
+  unreachable
+}
+
+; CHECK: @test5(i1 %cond)
+; CHECK: normal_bb:
+; CHECK-NEXT: ret i32 1
+
+; As we devirtualize, calls are simplified to look through bitcasts.
+; Check a number of different bitcast cases below.
+
+; Check that we simplify a bitcast of the return value.
+define i8 @bitcast_result() {
+  %funcall1_ = call fastcc i32 ()* ()* @f1()
+  %bc = bitcast i32 ()* %funcall1_ to i8 ()*
+  %executecommandptr1_ = call i8 %bc()
+  ret i8 %executecommandptr1_
+}
+
+; CHECK: @bitcast_result()
+; CHECK: %executecommandptr1_ = call i8 bitcast (i32 ()* @f2 to i8 ()*)()
+
+; Check that we simplify when the returned value was itself a bitcast.
+define i8 @bitcast_returned_value() {
+  %funcall1_ = call fastcc i8 ()* ()* @f1_bitcast()
+  %executecommandptr1_ = call i8 %funcall1_()
+  ret i8 %executecommandptr1_
+}
+
+define internal fastcc i8 ()* @f1_bitcast() nounwind readnone {
+  %bc = bitcast i32 ()* @f2 to i8 ()*
+  ret i8 ()* %bc
+}
+
+; CHECK: @bitcast_returned_value()
+; CHECK: %executecommandptr1_ = call i8 bitcast (i32 ()* @f2 to i8 ()*)()
+
+; Check that we simplify a bitcast of a bitcast, but that we still end up
+; with a constant bitcast of a Function* in the result.
+define i16 @bitcast_both1() {
+  %funcall1_ = call fastcc i8 ()* ()* @f1_bitcast()
+  %bc = bitcast i8 ()* %funcall1_ to i16 ()*
+  %executecommandptr1_ = call i16 %bc()
+  ret i16 %executecommandptr1_
+}
+
+; CHECK: @bitcast_both1()
+; CHECK: %executecommandptr1_ = call i16 bitcast (i32 ()* @f2 to i16 ()*)()
+
+; Check that we simplify a bitcast of a bitcast.
+; This time, however, we optimize the bitcasts away completely and inline
+; everything.
+define i32 @bitcast_both2() {
+  %funcall1_ = call fastcc i8 ()* ()* @f1_bitcast()
+  %bc = bitcast i8 ()* %funcall1_ to i32 ()*
+  %executecommandptr1_ = call i32 %bc()
+  ret i32 %executecommandptr1_
+}
+
+; CHECK: @bitcast_both2()
+; CHECK-NEXT: ret i32 1
\ No newline at end of file
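
Note for reviewers (not part of the patch): the following is a minimal sketch of the IR pattern the new devirtualizedCallUsers helper targets, using hypothetical names (@get_impl, @impl, @caller) rather than anything from the patch. Once @get_impl is inlined, every call made through its returned pointer sees a known Function*, so the indirect call can be rewritten to a direct call and the caller's call graph edge is retargeted to @impl via replaceCallEdge.

  ; Before inlining: @get_impl simply returns a function pointer.
  define internal i32 ()* @get_impl() nounwind readnone {
    ret i32 ()* @impl
  }

  define internal i32 @impl() nounwind readnone {
    ret i32 42
  }

  define i32 @caller() {
    %fp = call i32 ()* ()* @get_impl()  ; inlined; uses of %fp now see @impl
    %r = call i32 %fp()                 ; devirtualized to: call i32 @impl()
    ret i32 %r
  }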