Index: llvm/trunk/lib/Analysis/InlineCost.cpp
===================================================================
--- llvm/trunk/lib/Analysis/InlineCost.cpp
+++ llvm/trunk/lib/Analysis/InlineCost.cpp
@@ -136,6 +136,7 @@
   bool HasReturn;
   bool HasIndirectBr;
   bool HasFrameEscape;
+  bool UsesVarArgs; // Set when a va_start or va_end intrinsic is visited.
 
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize;
@@ -280,7 +281,7 @@
         IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
-        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
+        HasFrameEscape(false), UsesVarArgs(false), AllocatedSize(0), NumInstructions(0),
         NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
         EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
         NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
@@ -1233,6 +1234,10 @@
       case Intrinsic::localescape:
         HasFrameEscape = true;
         return false;
+      case Intrinsic::vastart:
+      case Intrinsic::vaend:
+        UsesVarArgs = true; // Treated as uninlinable by the analysis.
+        return false;
       }
     }
 
@@ -1567,7 +1572,7 @@
     using namespace ore;
     // If the visit this instruction detected an uninlinable pattern, abort.
     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
-        HasIndirectBr || HasFrameEscape) {
+        HasIndirectBr || HasFrameEscape || UsesVarArgs) {
       if (ORE)
         ORE->emit([&]() {
           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
Index: llvm/trunk/lib/Transforms/Utils/InlineFunction.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/InlineFunction.cpp
+++ llvm/trunk/lib/Transforms/Utils/InlineFunction.cpp
@@ -1500,10 +1500,9 @@
   IFI.reset();
 
   Function *CalledFunc = CS.getCalledFunction();
-  if (!CalledFunc ||              // Can't inline external function or indirect
-      CalledFunc->isDeclaration() ||
-      (!ForwardVarArgsTo && CalledFunc->isVarArg())) // call, or call to a vararg function!
-      return false;
+  if (!CalledFunc ||               // Can't inline external function or indirect
+      CalledFunc->isDeclaration()) // call!
+    return false;
 
   // The inliner does not know how to inline through calls with operand bundles
   // in general ...
@@ -1630,9 +1629,6 @@
 
     auto &DL = Caller->getParent()->getDataLayout();
 
-    assert((CalledFunc->arg_size() == CS.arg_size() || ForwardVarArgsTo) &&
-           "Varargs calls can only be inlined if the Varargs are forwarded!");
-
     // Calculate the vector of arguments to pass into the function cloner, which
     // matches up the formal to the actual argument values.
     CallSite::arg_iterator AI = CS.arg_begin();
@@ -1833,6 +1829,30 @@
         if (!CI)
           continue;
 
+        // Forward varargs from inlined call site to calls to the
+        // ForwardVarArgsTo function, if requested, and to musttail calls.
+        if (!VarArgsToForward.empty() &&
+            ((ForwardVarArgsTo &&
+              CI->getCalledFunction() == ForwardVarArgsTo) ||
+             CI->isMustTailCall())) {
+          SmallVector<Value *, 6> Params(CI->arg_operands());
+          Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
+          CallInst *Call =
+              CallInst::Create(CI->getCalledFunction() ? CI->getCalledFunction()
+                                                       : CI->getCalledValue(),
+                               Params, "", CI);
+          // Apart from the extra arguments, the replacement must match the
+          // original call: same debug location, tail-call kind and convention.
+          Call->setDebugLoc(CI->getDebugLoc());
+          Call->setTailCallKind(CI->getTailCallKind());
+          Call->setCallingConv(CI->getCallingConv());
+          CI->replaceAllUsesWith(Call);
+          CI->eraseFromParent();
+          // CI was just erased; keep processing the replacement call so the
+          // checks below never touch the deleted instruction.
+          CI = Call;
+        }
+
         if (Function *F = CI->getCalledFunction())
           InlinedDeoptimizeCalls |=
               F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
@@ -1860,16 +1880,6 @@
         // 'nounwind'.
         if (MarkNoUnwind)
           CI->setDoesNotThrow();
-
-        if (ForwardVarArgsTo && !VarArgsToForward.empty() &&
-            CI->getCalledFunction() == ForwardVarArgsTo) {
-          SmallVector<Value*, 6> Params(CI->arg_operands());
-          Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
-          CallInst *Call = CallInst::Create(CI->getCalledFunction(), Params, "", CI);
-          Call->setDebugLoc(CI->getDebugLoc());
-          CI->replaceAllUsesWith(Call);
-          CI->eraseFromParent();
-        }
       }
     }
   }
Index: llvm/trunk/test/Transforms/Inline/inline-musttail-varargs.ll
===================================================================
--- llvm/trunk/test/Transforms/Inline/inline-musttail-varargs.ll
+++ llvm/trunk/test/Transforms/Inline/inline-musttail-varargs.ll
@@ -1,23 +0,0 @@
-; RUN: opt < %s -inline -instcombine -S | FileCheck %s
-; RUN: opt < %s -passes='cgscc(inline,function(instcombine))' -S | FileCheck %s
-
-; We can't inline this thunk yet, but one day we will be able to.  And when we
-; do, this test case will be ready.
-
-declare void @ext_method(i8*, i32)
-
-define linkonce_odr void @thunk(i8* %this, ...) {
-  %this_adj = getelementptr i8, i8* %this, i32 4
-  musttail call void (i8*, ...) bitcast (void (i8*, i32)* @ext_method to void (i8*, ...)*)(i8* %this_adj, ...)
-  ret void
-}
-
-define void @thunk_caller(i8* %p) {
-  call void (i8*, ...) @thunk(i8* %p, i32 42)
-  ret void
-}
-; CHECK-LABEL: define void @thunk_caller(i8* %p)
-; CHECK: call void (i8*, ...) @thunk(i8* %p, i32 42)
-
-; FIXME: Inline the thunk. This should be significantly easier than inlining
-; general varargs functions.
Index: llvm/trunk/test/Transforms/Inline/inline-varargs.ll
===================================================================
--- llvm/trunk/test/Transforms/Inline/inline-varargs.ll
+++ llvm/trunk/test/Transforms/Inline/inline-varargs.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(inline,function(instcombine))' -S | FileCheck %s
+
+declare void @ext_method(i8*, i32)
+declare void @vararg_fn(i8*, ...)
+
+define linkonce_odr void @thunk(i8* %this, ...) { ; variadic thunk, forwards "..." via musttail
+  %this_adj = getelementptr i8, i8* %this, i32 4
+  musttail call void (i8*, ...) bitcast (void (i8*, i32)* @ext_method to void (i8*, ...)*)(i8* %this_adj, ...)
+  ret void
+}
+
+define void @thunk_caller(i8* %p) {
+  call void (i8*, ...) @thunk(i8* %p, i32 42) ; i32 42 travels through the "..." of @thunk
+  ret void
+}
+; CHECK-LABEL: define void @thunk_caller(i8* %p)
+; CHECK: call void (i8*, ...) bitcast (void (i8*, i32)* @ext_method to void (i8*, ...)*)(i8* %this_adj.i, i32 42)
+
+define void @test_callee_2(i8* %this, ...) { ; forwards "..." to a variadic callee
+  %this_adj = getelementptr i8, i8* %this, i32 4
+  musttail call void (i8*, ...) @vararg_fn(i8* %this_adj, ...)
+  ret void
+}
+
+define void @test_caller_2(i8* %p) {
+  call void (i8*, ...) @test_callee_2(i8* %p) ; no variadic arguments at this call site
+  ret void
+}
+; CHECK-LABEL: define void @test_caller_2(i8* %p)
+; CHECK: call void (i8*, ...) @vararg_fn(i8* %this_adj.i)
+
+
+define internal i32 @varg_accessed(...) { ; reads its varargs with va_start/va_arg/va_end
+entry:
+  %vargs = alloca i8*, align 8
+  %vargs.ptr = bitcast i8** %vargs to i8*
+  call void @llvm.va_start(i8* %vargs.ptr)
+  %va1 = va_arg i8** %vargs, i32
+  call void @llvm.va_end(i8* %vargs.ptr)
+  ret i32 %va1
+}
+
+define i32 @call_vargs() {
+  %res = call i32 (...) @varg_accessed(i32 10) ; the check below expects this call to remain
+  ret i32 %res
+}
+; CHECK-LABEL: @call_vargs
+; CHECK: %res = call i32 (...) @varg_accessed(i32 10)
+
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_end(i8*)