Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -97,6 +97,9 @@
   bool HasReturn;
   bool HasIndirectBr;
   bool HasFrameEscape;
+  bool HasVaStart;      // Whether F contains a call to va_start.
+  bool HasMustTailCall; // Whether F contains a musttail call.
+  bool FIsVarArg;       // Whether F is variadic by declaration.
 
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize;
@@ -209,7 +212,8 @@
         Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
-        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
+        HasFrameEscape(false), HasVaStart(false), HasMustTailCall(false),
+        FIsVarArg(Callee.isVarArg()), AllocatedSize(0), NumInstructions(0),
         NumVectorInstructions(0), FiftyPercentVectorBonus(0),
         TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
         NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
@@ -891,6 +895,16 @@
   }
   if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
     ContainsNoDuplicateCall = true;
+  if (CS.isCall()) {
+    auto *CI = cast<CallInst>(CS.getInstruction());
+    if (CI->isMustTailCall()) {
+      HasMustTailCall = true;
+      // If A calls a variadic function B, and B has a musttail call to C, then
+      // C may call va_start and access the arguments A passed to B through
+      // '...'. Therefore we cannot inline B into A.
+      if (FIsVarArg) return false; // No point looking further.
+    }
+  }
 
   if (Function *F = CS.getCalledFunction()) {
     // When we have a concrete function, first try to simplify it directly.
@@ -917,6 +931,10 @@
       case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
+      case Intrinsic::vastart:
+        // The callee calls va_start; inlining cannot handle that.
+        HasVaStart = true;
+        return false;
       }
     }
@@ -1129,7 +1147,8 @@
     // If the visit this instruction detected an uninlinable pattern, abort.
     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
-        HasIndirectBr || HasFrameEscape)
+        HasIndirectBr || HasFrameEscape || HasVaStart ||
+        (HasMustTailCall && FIsVarArg))
       return false;
 
     // If the caller is a recursive function then we don't want to inline
@@ -1509,6 +1528,7 @@
 
 bool llvm::isInlineViable(Function &F) {
   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
+  bool IsVarArg = F.isVarArg();
   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
     // Disallow inlining of functions which contain indirect branches or
     // blockaddresses.
@@ -1520,6 +1540,14 @@
       if (!CS)
         continue;
 
+      if (IsVarArg) {
+        if (const CallInst *CI = dyn_cast<CallInst>(&II)) {
+          // Same logic as in analyzeBlock above.
+          if (CI->isMustTailCall())
+            return false;
+        }
+      }
+
       // Disallow recursive calls.
       if (&F == CS.getCalledFunction())
         return false;
@@ -1530,12 +1558,13 @@
           cast<CallInst>(CS.getInstruction())->canReturnTwice())
         return false;
 
-      // Disallow inlining functions that call @llvm.localescape. Doing this
-      // correctly would require major changes to the inliner.
-      if (CS.getCalledFunction() &&
-          CS.getCalledFunction()->getIntrinsicID() ==
-              llvm::Intrinsic::localescape)
-        return false;
+      // Disallow inlining functions that call @llvm.localescape or @llvm.va_start.
+      // Doing this correctly would require major changes to the inliner.
+      if (const Function *CF = CS.getCalledFunction()) {
+        Intrinsic::ID IID = CF->getIntrinsicID();
+        if (IID == Intrinsic::localescape || IID == Intrinsic::vastart)
+          return false;
+      }
     }
   }
Index: lib/Transforms/Utils/InlineFunction.cpp
===================================================================
--- lib/Transforms/Utils/InlineFunction.cpp
+++ lib/Transforms/Utils/InlineFunction.cpp
@@ -1441,9 +1441,7 @@
 
   const Function *CalledFunc = CS.getCalledFunction();
   if (!CalledFunc ||               // Can't inline external function or indirect
-      CalledFunc->isDeclaration() || // call, or call to a vararg function!
-      CalledFunc->getFunctionType()->isVarArg()) return false;
-
+      CalledFunc->isDeclaration()) return false; // calls.
   // The inliner does not know how to inline through calls with operand bundles
   // in general ...
   if (CS.hasOperandBundles()) {
@@ -1569,7 +1567,11 @@
 
   auto &DL = Caller->getParent()->getDataLayout();
 
-  assert(CalledFunc->arg_size() == CS.arg_size() &&
+  if (CalledFunc->getFunctionType()->isVarArg())
+    assert(CalledFunc->arg_size() <= CS.arg_size() &&
+           "Not enough arguments passed to vararg function");
+  else
+    assert(CalledFunc->arg_size() == CS.arg_size() &&
          "No varargs calls can be inlined!");
 
   // Calculate the vector of arguments to pass into the function cloner, which
@@ -1746,16 +1748,25 @@
     if (CallInst *CI = dyn_cast<CallInst>(TheCall))
       CallSiteTailKind = CI->getTailCallKind();
 
+    bool CalleeIsVararg = CalledFunc->isVarArg(); // Just for the asserts below.
     for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;
          ++BB) {
       for (Instruction &I : *BB) {
        CallInst *CI = dyn_cast<CallInst>(&I);
        if (!CI)
          continue;
+        if (CalleeIsVararg) {
+          assert(!CI->isMustTailCall() &&
+                 "Inlined musttail call in vararg function");
+        }
 
-        if (Function *F = CI->getCalledFunction())
+        if (Function *F = CI->getCalledFunction()) {
          InlinedDeoptimizeCalls |=
              F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
+          if (CalleeIsVararg)
+            assert(F->getIntrinsicID() != Intrinsic::vastart &&
+                   "Inlined va_start");
+        }
 
        // We need to reduce the strength of any inlined tail calls. For
        // musttail, we have to avoid introducing potential unbounded stack
Index: test/Transforms/Inline/inline-varargs.ll
===================================================================
--- test/Transforms/Inline/inline-varargs.ll
+++ test/Transforms/Inline/inline-varargs.ll
@@ -0,0 +1,254 @@
+; RUN: opt -inline -S %s | FileCheck %s
+
+; Check that the side effects of unused arguments still happen.
+; CHECK: %call{{.*}}callThis
+; CHECK: %call{{.*}}callThat
+
+; Verify that calleeY gets inlined. The call should not exist.
+; CHECK-NOT: call{{.*}}calleeY
+
+; Verify that the calleeN* functions do not get inlined. The calls should exist.
+; CHECK: call{{.*}}calleeNMT
+; CHECK: call{{.*}}calleeNMTAI
+; CHECK: call{{.*}}calleeN
+; CHECK: call{{.*}}calleeNAI
+
+; ModuleID = 'inl.cpp'
+;source_filename = "inl.cpp"
+;target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+;target triple = "x86_64-scei-ps4"
+
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+$_Z7calleeYiPKcz = comdat any
+$_Z9calleeNMTiPKcz = comdat any
+$_Z11calleeNMTAIiPKcz = comdat any
+$_Z7calleeNiPKcz = comdat any
+$_Z9calleeNAIiPKcz = comdat any
+
+@.str = private unnamed_addr constant [4 x i8] c"abc\00", align 1
+
+; Function Attrs: nounwind
+define i32 @_Z6calleriPKcz(i32 %p, i8* %x, ...) #0 {
+entry:
+  %p.addr = alloca i32, align 4
+  %x.addr = alloca i8*, align 8
+  %r = alloca i32, align 4
+  %q = alloca i32, align 4
+  store i32 %p, i32* %p.addr, align 4
+  store i8* %x, i8** %x.addr, align 8
+  %call = call i32 @_Z8callThisv()
+  %call1 = call i32 @_Z8callThatv()
+  %call2 = call i32 (i32, i8*, ...) @_Z7calleeYiPKcz(i32 12, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %p.addr, i32 %call, i32 2, i32 %call1)
+  store i32 %call2, i32* %r, align 4
+  %call3 = call i32 (i32, i8*, ...) @_Z9calleeNMTiPKcz(i32 12, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %p.addr, i32 2, i32 3)
+  %0 = load i32, i32* %r, align 4
+  %add = add nsw i32 %0, %call3
+  store i32 %add, i32* %r, align 4
+  %call4 = call i32 (i32, i8*, ...) @_Z11calleeNMTAIiPKcz(i32 12, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %p.addr, i32 2, i32 3)
+  %1 = load i32, i32* %r, align 4
+  %add5 = add nsw i32 %1, %call4
+  store i32 %add5, i32* %r, align 4
+  %2 = load i32, i32* %p.addr, align 4
+  %inc = add nsw i32 %2, 1
+  store i32 %inc, i32* %p.addr, align 4
+  %call6 = call i32 (i32, i8*, ...) @_Z7calleeNiPKcz(i32 142, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %p.addr, i32 2, i32 3)
+  store i32 %call6, i32* %q, align 4
+  %call7 = call i32 (i32, i8*, ...) @_Z9calleeNAIiPKcz(i32 122, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %p.addr, i32 5, i32 3)
+  %3 = load i32, i32* %q, align 4
+  %add8 = add nsw i32 %3, %call7
+  store i32 %add8, i32* %q, align 4
+  %4 = load i32, i32* %r, align 4
+  %5 = load i32, i32* %q, align 4
+  %add9 = add nsw i32 %4, %5
+  %6 = load i8*, i8** %x.addr, align 8
+  %call10 = musttail call i32 (i32, i8*, ...) @_Z5addMTiPKcz(i32 %add9, i8* %6, ...)
+  ret i32 %call10
+}
+
+; Function Attrs: inlinehint nounwind
+define linkonce_odr i32 @_Z7calleeYiPKcz(i32 %a, i8* %m, ...) #1 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %m.addr = alloca i8*, align 8
+  store i32 %a, i32* %a.addr, align 4
+  store i8* %m, i8** %m.addr, align 8
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 1
+  %call = call i32 @_Z4add2i(i32 %add)
+  ret i32 %call
+}
+
+declare i32 @_Z8callThisv() #2
+
+declare i32 @_Z8callThatv() #2
+
+; Function Attrs: inlinehint nounwind
+define linkonce_odr i32 @_Z9calleeNMTiPKcz(i32 %a, i8* %m, ...) #1 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %m.addr = alloca i8*, align 8
+  store i32 %a, i32* %a.addr, align 4
+  store i8* %m, i8** %m.addr, align 8
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 2
+  %1 = load i8*, i8** %m.addr, align 8
+  %call = musttail call i32 (i32, i8*, ...) @_Z5addMTiPKcz(i32 %add, i8* %1, i32 5, ...)
+  ret i32 %call
+}
+
+; Function Attrs: inlinehint nounwind
+define linkonce_odr i32 @_Z11calleeNMTAIiPKcz(i32 %a, i8* %m, ...) #3 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %m.addr = alloca i8*, align 8
+  store i32 %a, i32* %a.addr, align 4
+  store i8* %m, i8** %m.addr, align 8
+  %0 = load i32, i32* %a.addr, align 4
+  %sub = sub nsw i32 %0, 2
+  %1 = load i8*, i8** %m.addr, align 8
+  %call = musttail call i32 (i32, i8*, ...) @_Z5addMTiPKcz(i32 %sub, i8* %1, i32 6, ...)
+  ret i32 %call
+}
+
+; Function Attrs: inlinehint nounwind
+define linkonce_odr i32 @_Z7calleeNiPKcz(i32 %a, i8* %m, ...) #1 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %m.addr = alloca i8*, align 8
+  %ap = alloca [1 x %struct.__va_list_tag], align 16
+  store i32 %a, i32* %a.addr, align 4
+  store i8* %m, i8** %m.addr, align 8
+  %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
+  %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8*
+  call void @llvm.va_start(i8* %arraydecay1)
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 3
+  %arraydecay2 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
+  %gp_offset_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 0
+  %gp_offset = load i32, i32* %gp_offset_p, align 16
+  %fits_in_gp = icmp ule i32 %gp_offset, 40
+  br i1 %fits_in_gp, label %vaarg.in_reg, label %vaarg.in_mem
+
+vaarg.in_reg:                                     ; preds = %entry
+  %1 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 3
+  %reg_save_area = load i8*, i8** %1, align 16
+  %2 = getelementptr i8, i8* %reg_save_area, i32 %gp_offset
+  %3 = bitcast i8* %2 to i32*
+  %4 = add i32 %gp_offset, 8
+  store i32 %4, i32* %gp_offset_p, align 16
+  br label %vaarg.end
+
+vaarg.in_mem:                                     ; preds = %entry
+  %overflow_arg_area_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 2
+  %overflow_arg_area = load i8*, i8** %overflow_arg_area_p, align 8
+  %5 = bitcast i8* %overflow_arg_area to i32*
+  %overflow_arg_area.next = getelementptr i8, i8* %overflow_arg_area, i32 8
+  store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8
+  br label %vaarg.end
+
+vaarg.end:                                        ; preds = %vaarg.in_mem, %vaarg.in_reg
+  %vaarg.addr = phi i32* [ %3, %vaarg.in_reg ], [ %5, %vaarg.in_mem ]
+  %6 = load i32, i32* %vaarg.addr, align 4
+  %add3 = add nsw i32 %add, %6
+  ret i32 %add3
+}
+
+; Function Attrs: alwaysinline inlinehint nounwind
+define linkonce_odr i32 @_Z9calleeNAIiPKcz(i32 %a, i8* %m, ...) #3 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %m.addr = alloca i8*, align 8
+  %ap = alloca [1 x %struct.__va_list_tag], align 16
+  store i32 %a, i32* %a.addr, align 4
+  store i8* %m, i8** %m.addr, align 8
+  %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
+  %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8*
+  call void @llvm.va_start(i8* %arraydecay1)
+  %0 = load i32, i32* %a.addr, align 4
+  %mul = mul nsw i32 %0, 2
+  %arraydecay2 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
+  %gp_offset_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 0
+  %gp_offset = load i32, i32* %gp_offset_p, align 16
+  %fits_in_gp = icmp ule i32 %gp_offset, 40
+  br i1 %fits_in_gp, label %vaarg.in_reg, label %vaarg.in_mem
+
+vaarg.in_reg:                                     ; preds = %entry
+  %1 = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 3
+  %reg_save_area = load i8*, i8** %1, align 16
+  %2 = getelementptr i8, i8* %reg_save_area, i32 %gp_offset
+  %3 = bitcast i8* %2 to i32*
+  %4 = add i32 %gp_offset, 8
+  store i32 %4, i32* %gp_offset_p, align 16
+  br label %vaarg.end
+
+vaarg.in_mem:                                     ; preds = %entry
+  %overflow_arg_area_p = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %arraydecay2, i32 0, i32 2
+  %overflow_arg_area = load i8*, i8** %overflow_arg_area_p, align 8
+  %5 = bitcast i8* %overflow_arg_area to i32*
+  %overflow_arg_area.next = getelementptr i8, i8* %overflow_arg_area, i32 8
+  store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8
+  br label %vaarg.end
+
+vaarg.end:                                        ; preds = %vaarg.in_mem, %vaarg.in_reg
+  %vaarg.addr = phi i32* [ %3, %vaarg.in_reg ], [ %5, %vaarg.in_mem ]
+  %6 = load i32, i32* %vaarg.addr, align 4
+  %add = add nsw i32 %mul, %6
+  ret i32 %add
+}
+
+declare i32 @_Z5addMTiPKcz(i32, i8*, ...) #2
+
+declare i32 @_Z4add2i(i32) #2
+
+; Function Attrs: nounwind
+declare void @llvm.va_start(i8*) #4
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { alwaysinline inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (trunk)"}
+
+;Generated by processing this file with:
+; clang++.exe -cc1 -triple x86_64-scei-ps4 inl.cpp -emit-llvm
+;
+;#include <stdarg.h>
+;int addMT(int p, const char *, ...);
+;int add2(int);
+;
+;inline int calleeY(int a, const char *m, ...) {return add2(a+1);}
+;inline int calleeNMT(int a, const char *m, ...) {return addMT(a+2,m, 5);}
+;inline int // __attribute__((always_inline)) // will add always_inline in .ll
+;  calleeNMTAI(int a, const char *m, ...) {return addMT(a-2,m, 6);}
+;
+;// do not inline this
+;inline int calleeN(int a, const char *m, ...) {
+;  va_list ap;
+;  va_start(ap, m);
+;  return a+3 + va_arg(ap, int);
+;}
+;// even if it is always_inline
+;inline int __attribute__((always_inline)) calleeNAI(int a, const char *m, ...) {
+;  va_list ap;
+;  va_start(ap, m);
+;  return a*2 + va_arg(ap, int);
+;}
+;int callThis(); // calls to this must be preserved
+;int callThat(); // calls to this must be preserved
+;int caller(int p, const char *x, ...)
+;{
+;  int r = calleeY(12, "abc", &p, callThis(), 2, callThat()); // inline this
+;  r += calleeNMT(12, "abc", &p, 2, 3);   // No. calleeNMT has a musttail call.
+;  r += calleeNMTAI(12, "abc", &p, 2, 3); // Not even if it is always_inline.
+;  p++;
+;  int q = calleeN(142, "abc", &p, 2, 3); // No. calleeN has va_start.
+;  q += calleeNAI(122, "abc", &p, 5, 3);  // Not even if it has always_inline.
+;  return addMT(r + q, x);
+;}
+;
+;// and then running the following SED script on it:
+;// SED /define.*calleeNMTAI/s/#1/#3/
+;// SED /call.*addMT/s/= call/= musttail call/
+;// SED /call.*addMT/s/)$/, ...)/
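
For reviewers, here is a minimal C++ sketch of the va_start half of the new restriction; the function names sum2 and use are hypothetical and appear nowhere in the patch or the test. A callee that calls va_start reads its own variadic frame, so splicing its body into a caller would leave va_start with no matching '...' arguments to read; the musttail restriction is the same hazard one level removed, since a musttail call forwards the callee's '...' onward to yet another function.

  #include <stdarg.h>

  // Hypothetical illustration only. sum2's va_start refers to the '...'
  // arguments of sum2's own call frame. If sum2's body were inlined into
  // use(), there would be no variadic frame holding (10, 20) for va_start
  // to find, which is why the patch keeps HasVaStart callees out of the
  // inliner while still allowing variadic callees that never touch '...'.
  static int sum2(int n, ...) {
    va_list ap;
    va_start(ap, n);
    int a = va_arg(ap, int);
    int b = va_arg(ap, int);
    va_end(ap);
    return a + b;
  }

  int use() { return sum2(2, 10, 20); } // must remain a real call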