Index: lib/Transforms/IPO/Inliner.cpp
===================================================================
--- lib/Transforms/IPO/Inliner.cpp
+++ lib/Transforms/IPO/Inliner.cpp
@@ -517,6 +517,20 @@
       Function *Caller = CS.getCaller();
       Function *Callee = CS.getCalledFunction();
 
+      // Track whether this is the last use of the callee; this is used
+      // later to delete the now-unused function body.
+      // TODO: Use this information to splice the callee in instead of cloning.
+      bool LastCalleeUse = Callee && Callee->hasOneUse() &&
+                           Callee->hasLocalLinkage() &&
+                           // TODO: Can remove if in SCC now.
+                           !SCCFunctions.count(Callee) &&
+
+                           // The function may be apparently dead, but if
+                           // there are indirect callgraph references to the
+                           // node, we cannot delete it yet; doing so could
+                           // invalidate the CGSCC iterator.
+                           CG[Callee]->getNumReferences() == 1;
+
       // If this call site is dead and it is to a readonly function, we should
       // just delete the call instead of trying to inline it, regardless of
       // size.  This happens because IPSCCP propagates the result out of the
@@ -531,7 +545,7 @@
       } else {
         // We can only inline direct calls to non-declarations.
         if (!Callee || Callee->isDeclaration()) continue;
-      
+
         // If this call site was obtained by inlining another function, verify
         // that the include path for the function did not include the callee
         // itself.  If so, we'd be recursively inlining the same function,
@@ -588,19 +602,15 @@
           }
         }
       }
-      
+
       // If we inlined or deleted the last possible call site to the function,
       // delete the function body now.
-      if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
-          // TODO: Can remove if in SCC now.
-          !SCCFunctions.count(Callee) &&
-
-          // The function may be apparently dead, but if there are indirect
-          // callgraph references to the node, we cannot delete it yet, this
-          // could invalidate the CGSCC iterator.
-          CG[Callee]->getNumReferences() == 0) {
+      if (LastCalleeUse) {
        DEBUG(dbgs() << "    -> Deleting dead function: "
              << Callee->getName() << "\n");
+
+        assert(Callee->use_empty() && CG[Callee]->getNumReferences() == 0 &&
+               "Failed to update call graph after devirtualizing");
        CallGraphNode *CalleeNode = CG[Callee];
 
        // Remove any call graph edges from the callee to its callees.
Index: lib/Transforms/Utils/InlineFunction.cpp
===================================================================
--- lib/Transforms/Utils/InlineFunction.cpp
+++ lib/Transforms/Utils/InlineFunction.cpp
@@ -506,6 +506,98 @@
   return nullptr;
 }
 
+/// \brief Given the value returned by an inlined function and the inlined
+/// call site, try to devirtualize any calls made through that value when it
+/// points to a Function.  Updates the call graph for devirtualized calls.
+static bool devirtualizedCallUsers(Value *ReturnValue,
+                                   Instruction *TheCall,
+                                   InlineFunctionInfo &IFI) {
+  if (!IFI.CG)
+    return false;
+  // Check whether the returned value points to a Function*, looking through
+  // bitcasts to do so.  Without a Function*, there's nothing to devirtualize.
+  Function *CalledF = nullptr;
+  if (Operator::getOpcode(ReturnValue) == Instruction::BitCast)
+    CalledF = dyn_cast<Function>(cast<Operator>(ReturnValue)->getOperand(0));
+  else
+    CalledF = dyn_cast<Function>(ReturnValue);
+  if (!CalledF)
+    return false;
+
+  // When we find an eligible call we update its edge with replaceCallEdge,
+  // which is linear in the number of call edges.  Rather than doing RAUW and
+  // then re-walking the users of the return value, first collect all the
+  // eligible call sites so replaceCallEdge is only called when needed.
+  SmallVector<BitCastInst *, 4> Bitcasts;
+  SmallPtrSet<Instruction *, 4> Calls;
+
+  // Note: calling 'Use.set' (as RAUW does) would invalidate these iterators,
+  // so only record the interesting users here and replace the uses below.
+  for (Use &TheUse : TheCall->uses()) {
+    User *U = TheUse.getUser();
+    // Note down bitcasts for later.  We'll recursively walk them looking for
+    // more calls.
+    if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) {
+      Bitcasts.push_back(BI);
+      continue;
+    }
+    // If we find a call where this use is the called value (not an argument
+    // to the call), note it down as one to devirtualize.
+    CallSite CS(U);
+    if (CS && CS.getCalledValue() == TheCall) {
+      Calls.insert(CS.getInstruction());
+      continue;
+    }
+  }
+
+  TheCall->replaceAllUsesWith(ReturnValue);
+
+  // Now recursively walk the bitcast users looking for more calls.
+  while (!Bitcasts.empty()) {
+    BitCastInst *BI = Bitcasts.pop_back_val();
+    for (User *U : BI->users()) {
+      if (BitCastInst *UserBI = dyn_cast<BitCastInst>(U)) {
+        Bitcasts.push_back(UserBI);
+        continue;
+      }
+      // If we find a call to this bitcast, note it down as one we should
+      // try to devirtualize.
+      CallSite CS(U);
+      if (CS && CS.getCalledValue() == BI)
+        Calls.insert(CS.getInstruction());
+    }
+  }
+
+  // Now walk all the calls we found, and update the call graph for each one
+  // we manage to devirtualize.
+  if (Calls.empty())
+    return true;
+
+  const Function *Caller = TheCall->getParent()->getParent();
+  CallGraphNode *CallerNode = IFI.CG->getOrInsertFunction(Caller);
+  CallGraphNode *CalleeNode = IFI.CG->getOrInsertFunction(CalledF);
+
+  for (Instruction *CallI : Calls) {
+    CallSite CS(CallI);
+    assert(CS && "Expected Call/Invoke instruction");
+    Value *CalledValue = CS.getCalledValue();
+
+    // If we're calling through a bitcast, try to simplify it first.
+    if (CalledValue != CalledF)
+      if (Instruction *CalledInstruction = dyn_cast<Instruction>(CalledValue))
+        if (Value *V = SimplifyInstruction(CalledInstruction, IFI.DL)) {
+          CS.setCalledFunction(V);
+          // Delete the original instruction if it has no more users.
+          if (CalledValue->use_empty())
+            CalledInstruction->eraseFromParent();
+        }
+
+    if (CS.getCalledFunction() == CalledF)
+      CallerNode->replaceCallEdge(CS, CS, CalleeNode);
+  }
+  return true;
+}
+
 /// InlineFunction - This function inlines the called function into the basic
 /// block of the caller.  This returns false if it is not possible to inline
 /// this call.  The program is still in a well defined state if this occurs
@@ -623,7 +715,7 @@
     // have no dead or constant instructions leftover after inlining occurs
     // (which can happen, e.g., because an argument was constant), but we'll be
     // happy with whatever the cloner can do.
-    CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, 
+    CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
                               /*ModuleLevelChanges=*/false, Returns, ".i",
                               &InlinedFunctionInfo, IFI.DL, TheCall);
 
@@ -852,7 +944,8 @@
       ReturnInst *R = Returns[0];
       if (TheCall == R->getReturnValue())
         TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
-      else
+      else if (!devirtualizedCallUsers(R->getReturnValue(), TheCall,
+                                       IFI))
         TheCall->replaceAllUsesWith(R->getReturnValue());
     }
     // Since we are now done with the Call/Invoke, we can delete it.
@@ -956,7 +1049,8 @@
   if (!TheCall->use_empty()) {
     if (TheCall == Returns[0]->getReturnValue())
       TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
-    else
+    else if (!devirtualizedCallUsers(Returns[0]->getReturnValue(),
+                                     TheCall, IFI))
       TheCall->replaceAllUsesWith(Returns[0]->getReturnValue());
   }
 
Index: test/Transforms/Inline/devirtualize-2.ll
===================================================================
--- test/Transforms/Inline/devirtualize-2.ll
+++ test/Transforms/Inline/devirtualize-2.ll
@@ -42,3 +42,113 @@
 
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT: ret i32 41
+
+; This is test1 but with multiple basic blocks in the callee, to check that
+; we have coverage over all parts of the inliner which devirtualize based on
+; return values from inlined calls.
+define i32 @test3(i1 %cond) {
+  %funcall1_ = call fastcc i32 ()* (i1)* @f4a(i1 %cond)
+  %executecommandptr1_ = call i32 %funcall1_()
+  ret i32 %executecommandptr1_
+}
+
+define internal fastcc i32 ()* @f4a(i1 %cond) nounwind readnone {
+  br i1 %cond, label %l1, label %l2
+l2:
+  br label %l1
+l1:
+  ret i32 ()* @f5a
+}
+
+define internal i32 @f5a() nounwind readnone {
+  ret i32 1
+}
+
+; CHECK: @test3(i1 %cond)
+; CHECK: f4a.exit:
+; CHECK-NEXT: ret i32 1
+
+; This is test1 but with an invoke instead of a call.
+define i32 @test4(i1 %cond) {
+  %funcall1_ = invoke fastcc i32 ()* ()* @f1() to label %normal_bb unwind label %unwind_bb
+normal_bb:
+  %executecommandptr1_ = call i32 %funcall1_()
+  ret i32 %executecommandptr1_
+unwind_bb:
+  landingpad { i8*, i32 } personality i32 (...)* null
+    catch i8** null
+  unreachable
+}
+
+; CHECK: @test4(i1 %cond)
+; CHECK: normal_bb:
+; CHECK-NEXT: ret i32 1
+
+; This is test3 but with an invoke instead of a call.
+define i32 @test5(i1 %cond) {
+  %funcall1_ = invoke fastcc i32 ()* (i1)* @f4a(i1 %cond) to label %normal_bb unwind label %unwind_bb
+normal_bb:
+  %executecommandptr1_ = call i32 %funcall1_()
+  ret i32 %executecommandptr1_
+unwind_bb:
+  landingpad { i8*, i32 } personality i32 (...)* null
+    catch i8** null
+  unreachable
+}
+
+; CHECK: @test5(i1 %cond)
+; CHECK: normal_bb:
+; CHECK-NEXT: ret i32 1
+
+; As we devirtualize, calls are simplified to look through bitcasts.
+; Check a number of different bitcast cases below.
+
+; Check that we simplify a bitcast of the return value.
+define i8 @bitcast_result() {
+  %funcall1_ = call fastcc i32 ()* ()* @f1()
+  %bc = bitcast i32 ()* %funcall1_ to i8 ()*
+  %executecommandptr1_ = call i8 %bc()
+  ret i8 %executecommandptr1_
+}
+
+; CHECK: @bitcast_result()
+; CHECK: %executecommandptr1_ = call i8 bitcast (i32 ()* @f2 to i8 ()*)()
+
+; Check that we simplify when the returned value was itself a bitcast.
+define i8 @bitcast_returned_value() {
+  %funcall1_ = call fastcc i8 ()* ()* @f1_bitcast()
+  %executecommandptr1_ = call i8 %funcall1_()
+  ret i8 %executecommandptr1_
+}
+
+define internal fastcc i8 ()* @f1_bitcast() nounwind readnone {
+  %bc = bitcast i32 ()* @f2 to i8 ()*
+  ret i8 ()* %bc
+}
+
+; CHECK: @bitcast_returned_value()
+; CHECK: %executecommandptr1_ = call i8 bitcast (i32 ()* @f2 to i8 ()*)()
+
+; Check that we simplify a bitcast of a bitcast, but that we still end up
+; with a constant bitcast of a Function* in the result.
+define i16 @bitcast_both1() {
+  %funcall1_ = call fastcc i8 ()* ()* @f1_bitcast()
+  %bc = bitcast i8 ()* %funcall1_ to i16 ()*
+  %executecommandptr1_ = call i16 %bc()
+  ret i16 %executecommandptr1_
+}
+
+; CHECK: @bitcast_both1()
+; CHECK: %executecommandptr1_ = call i16 bitcast (i32 ()* @f2 to i16 ()*)()
+
+; Check that we simplify a bitcast of a bitcast.
+; This time, however, we optimize the bitcasts away completely and inline
+; everything.
+define i32 @bitcast_both2() {
+  %funcall1_ = call fastcc i8 ()* ()* @f1_bitcast()
+  %bc = bitcast i8 ()* %funcall1_ to i32 ()*
+  %executecommandptr1_ = call i32 %bc()
+  ret i32 %executecommandptr1_
+}
+
+; CHECK: @bitcast_both2()
+; CHECK-NEXT: ret i32 1
\ No newline at end of file
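
Note for reviewers (not part of the patch): the following is a minimal sketch of the IR pattern the new devirtualizedCallUsers helper targets, using hypothetical names (@get_impl, @impl, @caller) rather than anything from the patch. Once @get_impl is inlined, every call made through its returned pointer sees a known Function*, so the indirect call can be rewritten to a direct call and the caller's call graph edge is retargeted to @impl via replaceCallEdge.

  ; Before inlining: @get_impl simply returns a function pointer.
  define internal i32 ()* @get_impl() nounwind readnone {
    ret i32 ()* @impl
  }

  define internal i32 @impl() nounwind readnone {
    ret i32 42
  }

  define i32 @caller() {
    %fp = call i32 ()* ()* @get_impl()  ; inlined; uses of %fp now see @impl
    %r = call i32 %fp()                 ; devirtualized to: call i32 @impl()
    ret i32 %r
  }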