Index: llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h
+++ llvm/trunk/include/llvm/CodeGen/MachineFrameInfo.h
@@ -239,6 +239,9 @@
   /// True if the function contains a call to the llvm.vastart intrinsic.
   bool HasVAStart;
 
+  /// True if this is a varargs function that contains a musttail call.
+  bool HasMustTailInVarArgFunc;
+
   const TargetFrameLowering *getFrameLowering() const;
 public:
   explicit MachineFrameInfo(const TargetMachine &TM, bool RealignOpt)
@@ -260,6 +263,7 @@
     UseLocalStackAllocationBlock = false;
     HasInlineAsmWithSPAdjust = false;
     HasVAStart = false;
+    HasMustTailInVarArgFunc = false;
   }
 
   /// hasStackObjects - Return true if there are any stack objects in this
@@ -483,6 +487,10 @@
   bool hasVAStart() const { return HasVAStart; }
   void setHasVAStart(bool B) { HasVAStart = B; }
 
+  /// Returns true if the function is variadic and contains a musttail call.
+  bool hasMustTailInVarArgFunc() const { return HasMustTailInVarArgFunc; }
+  void setHasMustTailInVarArgFunc(bool B) { HasMustTailInVarArgFunc = B; }
+
   /// getMaxCallFrameSize - Return the maximum size of a call frame that must be
   /// allocated for an outgoing function call. This is only available if
   /// CallFrameSetup/Destroy pseudo instructions are used by the target, and
Index: llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -146,6 +146,13 @@
           MF->getFrameInfo()->setHasVAStart(true);
       }
 
+      // If we have a musttail call in a variadic function, we need to ensure we
+      // forward implicit register parameters.
+      if (auto *CI = dyn_cast<CallInst>(I)) {
+        if (CI->isMustTailCall() && Fn->isVarArg())
+          MF->getFrameInfo()->setHasMustTailInVarArgFunc(true);
+      }
+
       // Mark values used outside their block as exported, by allocating
       // a virtual register for them.
       if (isUsedOutsideOfDefiningBlock(I))
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -2326,6 +2326,52 @@
   }
 }
 
+static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
+                                                const X86Subtarget *Subtarget) {
+  assert(Subtarget->is64Bit());
+
+  if (Subtarget->isCallingConvWin64(CallConv)) {
+    static const MCPhysReg GPR64ArgRegsWin64[] = {
+      X86::RCX, X86::RDX, X86::R8, X86::R9
+    };
+    return GPR64ArgRegsWin64;
+  }
+
+  static const MCPhysReg GPR64ArgRegs64Bit[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+  };
+  return GPR64ArgRegs64Bit;
+}
+
+static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
+                                                CallingConv::ID CallConv,
+                                                const X86Subtarget *Subtarget) {
+  assert(Subtarget->is64Bit());
+  if (Subtarget->isCallingConvWin64(CallConv)) {
+    // The XMM registers which might contain var arg parameters are shadowed
+    // in their paired GPR. So we only need to save the GPR to their home
+    // slots.
+    return None;
+  }
+
+  const Function *Fn = MF.getFunction();
+  bool NoImplicitFloatOps = Fn->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
+         "SSE register cannot be used when SSE is disabled!");
+  if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+      !Subtarget->hasSSE1())
+    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
+    // registers.
+    return None;
+
+  static const MCPhysReg XMMArgRegs64Bit[] = {
+    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+  };
+  return XMMArgRegs64Bit;
+}
+
 SDValue
 X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                         CallingConv::ID CallConv,
@@ -2469,57 +2515,49 @@
   // If the function takes variable number of arguments, make a frame index for
   // the start of the first vararg value... for expansion of llvm.va_start. We
   // can skip this if there are no va_start calls.
-  if (isVarArg && MFI->hasVAStart()) {
-    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
-                    CallConv != CallingConv::X86_ThisCall)) {
-      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
-    }
-    if (Is64Bit) {
-      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
-
-      // FIXME: We should really autogenerate these arrays
-      static const MCPhysReg GPR64ArgRegsWin64[] = {
-        X86::RCX, X86::RDX, X86::R8, X86::R9
-      };
-      static const MCPhysReg GPR64ArgRegs64Bit[] = {
-        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
-      };
-      static const MCPhysReg XMMArgRegs64Bit[] = {
-        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
-        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
-      };
-      const MCPhysReg *GPR64ArgRegs;
-      unsigned NumXMMRegs = 0;
-
-      if (IsWin64) {
-        // The XMM registers which might contain var arg parameters are shadowed
-        // in their paired GPR. So we only need to save the GPR to their home
-        // slots.
-        TotalNumIntRegs = 4;
-        GPR64ArgRegs = GPR64ArgRegsWin64;
-      } else {
-        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
-        GPR64ArgRegs = GPR64ArgRegs64Bit;
-
-        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
-                                                TotalNumXMMRegs);
+  if (MFI->hasVAStart() &&
+      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
+                   CallConv != CallingConv::X86_ThisCall))) {
+    FuncInfo->setVarArgsFrameIndex(
+        MFI->CreateFixedObject(1, StackSize, true));
+  }
+
+  // 64-bit calling conventions support varargs and register parameters, so we
+  // have to do extra work to spill them in the prologue or forward them to
+  // musttail calls.
+  if (Is64Bit && isVarArg &&
+      (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
+    // Find the first unallocated argument registers.
+    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
+    unsigned NumIntRegs =
+        CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
+    unsigned NumXMMRegs =
+        CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
+    assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+           "SSE register cannot be used when SSE is disabled!");
+
+    // Gather all the live in physical registers.
+    SmallVector<SDValue, 6> LiveGPRs;
+    SmallVector<SDValue, 8> LiveXMMRegs;
+    SDValue ALVal;
+    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
+      LiveGPRs.push_back(
+          DAG.getCopyFromReg(DAG.getEntryNode(), dl, GPR, MVT::i64));
+    }
+    if (!ArgXMMs.empty()) {
+      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+      ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
+      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
+        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
+        LiveXMMRegs.push_back(
+            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
       }
-      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
-                                                       TotalNumIntRegs);
-
-      bool NoImplicitFloatOps = Fn->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
-      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
-             "SSE register cannot be used when SSE is disabled!");
-      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
-               NoImplicitFloatOps) &&
-             "SSE register cannot be used when SSE is disabled!");
-      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
-          !Subtarget->hasSSE1())
-        // Kernel mode asks for SSE to be disabled, so don't push them
-        // on the stack.
-        TotalNumXMMRegs = 0;
+    }
 
+    // Store them to the va_list returned by va_start.
+    if (MFI->hasVAStart()) {
       if (IsWin64) {
         const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
         // Get to the caller-allocated home save location. Add 8 to account
@@ -2535,10 +2573,9 @@
         // registers, then we must store them to their spots on the stack so
         // they may be loaded by deferencing the result of va_next.
         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
-        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
-        FuncInfo->setRegSaveFrameIndex(
-          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
-                                 false));
+        FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+        FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
+            ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
       }
 
       // Store the integer parameter registers.
@@ -2546,12 +2583,9 @@
       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                         getPointerTy());
       unsigned Offset = FuncInfo->getVarArgsGPOffset();
-      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+      for (SDValue Val : LiveGPRs) {
        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(Offset));
-        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
-                                     &X86::GR64RegClass);
-        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
@@ -2561,32 +2595,52 @@
        Offset += 8;
      }
 
-      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
+      if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
        // Now store the XMM (fp + vector) parameter registers.
        SmallVector<SDValue, 12> SaveXMMOps;
        SaveXMMOps.push_back(Chain);
-
-        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
-        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
        SaveXMMOps.push_back(ALVal);
-
        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getRegSaveFrameIndex()));
        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getVarArgsFPOffset()));
-
-        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
-          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
-                                       &X86::VR128RegClass);
-          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
-          SaveXMMOps.push_back(Val);
-        }
+        SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+                          LiveXMMRegs.end());
        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                     MVT::Other, SaveXMMOps));
      }
 
      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+    } else {
+      // TODO: Save virtual registers away somewhere so we can do
+      // getCopyFromReg in the musttail call lowering bb.
+      assert(MFI->hasMustTailInVarArgFunc());
+      auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+      typedef X86MachineFunctionInfo::Forward Forward;
+
+      // Add all GPRs, al, and XMMs to the list of forwards.
+      for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
+        unsigned VReg =
+            MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+        Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
+        Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
+      }
+
+      if (!ArgXMMs.empty()) {
+        unsigned ALVReg =
+            MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
+        Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
+        Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
+
+        for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
+          unsigned VReg =
+              MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
+          Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
+          Forwards.push_back(
+              Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
+        }
+      }
    }
  }
 
@@ -2689,6 +2743,7 @@
   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
   StructReturnType SR = callIsStructReturn(Outs);
   bool IsSibcall = false;
+  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
 
   if (MF.getTarget().Options.DisableTailCalls)
     isTailCall = false;
@@ -2741,7 +2796,6 @@
   int FPDiff = 0;
   if (isTailCall && !IsSibcall && !IsMustTail) {
     // Lower arguments at fp - stackoffset + fpdiff.
-    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
 
     FPDiff = NumBytesCallerPushed - NumBytes;
@@ -2884,7 +2938,7 @@
     }
   }
 
-  if (Is64Bit && isVarArg && !IsWin64) {
+  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
     // From AMD64 ABI document:
     // For calls that may call functions that use varargs or stdargs
     // (prototype-less calls or calls to functions containing ellipsis (...) in
@@ -2906,6 +2960,14 @@
                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   }
 
+  if (Is64Bit && isVarArg && IsMustTail) {
+    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
+    for (const auto &F : Forwards) {
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+    }
+  }
+
   // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
   // don't need this because the eligibility check rejects calls that require
   // shuffling arguments passed in memory.
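The LowerFormalArguments/LowerCall changes above are easiest to read against the IR shape they target: a variadic thunk that never calls llvm.va_start and simply forwards its entire variable-argument pack through a musttail call. The sketch below restates that shape up front (it mirrors the f_thunk test added at the end of this patch); the @select_impl and @forwarding_thunk names are illustrative only and appear nowhere in the patch.

  declare void(i8*, ...)* @select_impl(i8* %this)

  define void @forwarding_thunk(i8* %this, ...) {
    %impl = call void(i8*, ...)*(i8*)* @select_impl(i8* %this)
    musttail call void (i8*, ...)* %impl(i8* %this, ...)
    ret void
  }

With no va_start in the function, LowerFormalArguments takes the new else branch: each still-unallocated argument GPR and XMM register (plus AL, which carries the vector-register count under the SysV convention) is copied into a virtual register and recorded as a Forward entry, and LowerCall copies those values back into the same physical registers immediately before the tail jump. On the va_start path the same registers are instead spilled to the register save area, whose size on the SysV target works out to ArgGPRs.size() * 8 + ArgXMMs.size() * 16 = 6 * 8 + 8 * 16 = 176 bytes, matching the register save area described by the AMD64 psABI.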
Index: llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h
+++ llvm/trunk/lib/Target/X86/X86MachineFunctionInfo.h
@@ -15,6 +15,8 @@
 #define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
 
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include <vector>
 
 namespace llvm {
 
@@ -70,6 +72,22 @@
   unsigned NumLocalDynamics;
 
 public:
+  /// Describes a register that needs to be forwarded from the prologue to a
+  /// musttail call.
+  struct Forward {
+    Forward(unsigned VReg, MCPhysReg PReg, MVT VT)
+        : VReg(VReg), PReg(PReg), VT(VT) {}
+    unsigned VReg;
+    MCPhysReg PReg;
+    MVT VT;
+  };
+
+private:
+  /// ForwardedMustTailRegParms - A list of virtual and physical registers
+  /// that must be forwarded to every musttail call.
+  std::vector<Forward> ForwardedMustTailRegParms;
+
+public:
   X86MachineFunctionInfo() : ForceFramePointer(false),
                              CalleeSavedFrameSize(0),
                              BytesToPopOnReturn(0),
@@ -138,6 +156,9 @@
   unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
   void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
 
+  std::vector<Forward> &getForwardedMustTailRegParms() {
+    return ForwardedMustTailRegParms;
+  }
 };
 
 } // End llvm namespace
Index: llvm/trunk/test/CodeGen/X86/musttail-varargs.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/musttail-varargs.ll
+++ llvm/trunk/test/CodeGen/X86/musttail-varargs.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
+
+; Test that we actually spill and reload all arguments in the variadic argument
+; pack. Doing a normal call will clobber all argument registers, and we will
+; spill around it. A simple adjustment should not require any XMM spills.
+
+declare void(i8*, ...)* @get_f(i8* %this)
+
+define void @f_thunk(i8* %this, ...) {
+  %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this)
+  musttail call void (i8*, ...)* %fptr(i8* %this, ...)
+  ret void
+}
+
+; Save and restore 6 GPRs, 8 XMMs, and AL around the call.
+
+; LINUX-LABEL: f_thunk:
+; LINUX-DAG: movq %rdi, {{.*}}
+; LINUX-DAG: movq %rsi, {{.*}}
+; LINUX-DAG: movq %rdx, {{.*}}
+; LINUX-DAG: movq %rcx, {{.*}}
+; LINUX-DAG: movq %r8, {{.*}}
+; LINUX-DAG: movq %r9, {{.*}}
+; LINUX-DAG: movb %al, {{.*}}
+; LINUX-DAG: movaps %xmm0, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm1, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm2, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm3, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm4, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm5, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm6, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm7, {{[0-9]*}}(%rsp)
+; LINUX: callq get_f
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm0
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm1
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm2
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm3
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm4
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm5
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm6
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm7
+; LINUX-DAG: movq {{.*}}, %rdi
+; LINUX-DAG: movq {{.*}}, %rsi
+; LINUX-DAG: movq {{.*}}, %rdx
+; LINUX-DAG: movq {{.*}}, %rcx
+; LINUX-DAG: movq {{.*}}, %r8
+; LINUX-DAG: movq {{.*}}, %r9
+; LINUX-DAG: movb {{.*}}, %al
+; LINUX: jmpq *{{.*}} # TAILCALL
+
+; WINDOWS-LABEL: f_thunk:
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS-DAG: movq %rdx, {{.*}}
+; WINDOWS-DAG: movq %rcx, {{.*}}
+; WINDOWS-DAG: movq %r8, {{.*}}
+; WINDOWS-DAG: movq %r9, {{.*}}
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS: callq get_f
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS-DAG: movq {{.*}}, %rdx
+; WINDOWS-DAG: movq {{.*}}, %rcx
+; WINDOWS-DAG: movq {{.*}}, %r8
+; WINDOWS-DAG: movq {{.*}}, %r9
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS: jmpq *{{.*}} # TAILCALL
+
+; This thunk shouldn't require any spills and reloads, assuming the register
+; allocator knows what it's doing.
+
+define void @g_thunk(i8* %fptr_i8, ...) {
+  %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)*
+  musttail call void (i8*, ...)* %fptr(i8* %fptr_i8, ...)
+  ret void
+}
+
+; LINUX-LABEL: g_thunk:
+; LINUX-NOT: movq
+; LINUX: jmpq *%rdi # TAILCALL
+
+; WINDOWS-LABEL: g_thunk:
+; WINDOWS-NOT: movq
+; WINDOWS: jmpq *%rcx # TAILCALL
+
+; Do a simple multi-exit multi-bb test.
+
+%struct.Foo = type { i1, i8*, i8* }
+
+@g = external global i32
+
+define void @h_thunk(%struct.Foo* %this, ...) {
+  %cond_p = getelementptr %struct.Foo* %this, i32 0, i32 0
+  %cond = load i1* %cond_p
+  br i1 %cond, label %then, label %else
+
+then:
+  %a_p = getelementptr %struct.Foo* %this, i32 0, i32 1
+  %a_i8 = load i8** %a_p
+  %a = bitcast i8* %a_i8 to void (%struct.Foo*, ...)*
+  musttail call void (%struct.Foo*, ...)* %a(%struct.Foo* %this, ...)
+  ret void
+
+else:
+  %b_p = getelementptr %struct.Foo* %this, i32 0, i32 2
+  %b_i8 = load i8** %b_p
+  %b = bitcast i8* %b_i8 to void (%struct.Foo*, ...)*
+  store i32 42, i32* @g
+  musttail call void (%struct.Foo*, ...)* %b(%struct.Foo* %this, ...)
+  ret void
+}
+
+; LINUX-LABEL: h_thunk:
+; LINUX: jne
+; LINUX: jmpq *{{.*}} # TAILCALL
+; LINUX: jmpq *{{.*}} # TAILCALL
+; WINDOWS-LABEL: h_thunk:
+; WINDOWS: jne
+; WINDOWS: jmpq *{{.*}} # TAILCALL
+; WINDOWS: jmpq *{{.*}} # TAILCALL
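For contrast, a minimal sketch that is not part of the patch (the @select_impl and @plain_thunk names are hypothetical): a musttail thunk in a non-variadic function never sets the new hasMustTailInVarArgFunc() flag, since FunctionLoweringInfo only sets it when the calling function is varargs, and it needs none of the Forward machinery because every register it passes to the callee is already one of its own named arguments.

  declare void (i8*)* @select_impl(i8* %this)

  define void @plain_thunk(i8* %this) {
    %impl = call void (i8*)*(i8*)* @select_impl(i8* %this)
    musttail call void %impl(i8* %this)
    ret void
  }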