Index: llvm/include/llvm/CodeGen/CallingConvLower.h =================================================================== --- llvm/include/llvm/CodeGen/CallingConvLower.h +++ llvm/include/llvm/CodeGen/CallingConvLower.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_CALLINGCONVLOWER_H #define LLVM_CODEGEN_CALLINGCONVLOWER_H +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -163,10 +164,12 @@ }; /// Describes a register that needs to be forwarded from the prologue to a -/// musttail call. +/// musttail call. Specifying VReg == 0 means that the register should be +/// put into guarded area and no virtual register was created for it. struct ForwardedRegister { ForwardedRegister(unsigned VReg, MCPhysReg PReg, MVT VT) : VReg(VReg), PReg(PReg), VT(VT) {} + bool IsGuarded() const { return VReg == 0; } unsigned VReg; MCPhysReg PReg; MVT VT; @@ -525,8 +528,9 @@ /// Compute the set of registers that need to be preserved and forwarded to /// any musttail calls. void analyzeMustTailForwardedRegisters( - SmallVectorImpl &Forwards, ArrayRef RegParmTypes, - CCAssignFn Fn); + SmallVectorImpl &Forwards, + const SmallDenseSet &GuardedForwardedRegs, + ArrayRef RegParmTypes, CCAssignFn Fn); /// Returns true if the results of the two calling conventions are compatible. /// This is usually part of the check for tailcall eligibility. Index: llvm/include/llvm/CodeGen/MachineBasicBlock.h =================================================================== --- llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -110,6 +110,10 @@ /// Indicate that this basic block is entered via an exception handler. bool IsEHPad = false; + /// Indicate that this basic block used for saving vararg registers + /// and is entered from entry block. + bool IsGuardedRegsBlk = false; + /// Indicate that this basic block is potentially the target of an indirect /// branch. bool AddressTaken = false; @@ -378,6 +382,14 @@ /// Set alignment of the basic block. void setAlignment(Align A) { Alignment = A; } + /// Returns true if the block is used to save guarded varargs registers. + /// This basic block is entered from an entry block. + bool isGuardedRegsBlk() const { return IsGuardedRegsBlk; } + + /// Marks the block as one which is used to save guarded varargs registers. + /// This basic block is entered from an entry block. + void setIsGuardedRegsBlk(bool V = true) { IsGuardedRegsBlk = V; } + /// Returns true if the block is a landing pad. That is this basic block is /// entered via an exception handler. bool isEHPad() const { return IsEHPad; } Index: llvm/lib/CodeGen/CallingConvLower.cpp =================================================================== --- llvm/lib/CodeGen/CallingConvLower.cpp +++ llvm/lib/CodeGen/CallingConvLower.cpp @@ -236,8 +236,9 @@ } void CCState::analyzeMustTailForwardedRegisters( - SmallVectorImpl &Forwards, ArrayRef RegParmTypes, - CCAssignFn Fn) { + SmallVectorImpl &Forwards, + const SmallDenseSet &GuardedForwardedRegs, + ArrayRef RegParmTypes, CCAssignFn Fn) { // Oftentimes calling conventions will not user register parameters for // variadic functions, so we need to assume we're not variadic so that we get // all the registers that might be used in a non-variadic call. 
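The VReg == 0 convention introduced in ForwardedRegister above is the only thing that distinguishes a guarded forward from an ordinary one. The following standalone sketch is not code from the patch; MCPhysReg and MVT are stubbed with plain integers and the register numbers are made up, purely to illustrate how IsGuarded() partitions the forwarded registers.

#include <cstdio>
#include <vector>

// Mirrors the ForwardedRegister convention: VReg == 0 means "guarded",
// i.e. no virtual register was created and the value must be reloaded
// from the guarded save area instead of being copied through a vreg.
struct ForwardedRegisterSketch {
  unsigned VReg; // 0 => guarded
  unsigned PReg; // stand-in for MCPhysReg
  int VT;        // stand-in for MVT
  bool IsGuarded() const { return VReg == 0; }
};

int main() {
  std::vector<ForwardedRegisterSketch> Forwards = {
      {7, 40, 1}, // ordinary forward: copied through a virtual register
      {0, 17, 2}, // guarded forward: e.g. an XMM reg behind the %al check
  };
  for (const auto &F : Forwards)
    std::printf("preg %u: %s\n", F.PReg,
                F.IsGuarded() ? "reload from guarded save area"
                              : "forward through virtual register");
  return 0;
}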
@@ -250,8 +251,11 @@ const TargetLowering *TL = MF.getSubtarget().getTargetLowering(); const TargetRegisterClass *RC = TL->getRegClassFor(RegVT); for (MCPhysReg PReg : RemainingRegs) { - unsigned VReg = MF.addLiveIn(PReg, RC); - Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT)); + if (GuardedForwardedRegs.count(PReg) == 0) { + unsigned VReg = MF.addLiveIn(PReg, RC); + Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT)); + } else + Forwards.push_back(ForwardedRegister(0, PReg, RegVT)); } } } Index: llvm/lib/CodeGen/MachineVerifier.cpp =================================================================== --- llvm/lib/CodeGen/MachineVerifier.cpp +++ llvm/lib/CodeGen/MachineVerifier.cpp @@ -621,9 +621,11 @@ if (!MF->getProperties().hasProperty( MachineFunctionProperties::Property::NoPHIs) && MRI->tracksLiveness()) { // If this block has allocatable physical registers live-in, check that - // it is an entry block or landing pad. + // it is an entry block or landing pad or varargs guarded registers + // saving block. for (const auto &LI : MBB->liveins()) { if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() && + !MBB->isGuardedRegsBlk() && MBB->getIterator() != MBB->getParent()->begin()) { report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB); report_context(LI.PhysReg); Index: llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -209,6 +209,13 @@ // If we have a musttail call in a variadic function, we need to ensure we // forward implicit register parameters. if (const auto *CI = dyn_cast(&I)) { + // check for llvm::Intrinsic::icall_branch_funnel intrinsic. + // we do not store varargs parameters explicitly for icall_branch_funnel + if (CI->getCalledFunction() && + CI->getCalledFunction()->getIntrinsicID() == + llvm::Intrinsic::icall_branch_funnel) + continue; + if (CI->isMustTailCall() && Fn->isVarArg()) MF->getFrameInfo().setHasMustTailInVarArgFunc(true); } Index: llvm/lib/Target/AArch64/AArch64CallLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ llvm/lib/Target/AArch64/AArch64CallLowering.cpp @@ -397,7 +397,16 @@ // Later on, we can use this vector to restore the registers if necessary. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn); + + // TODO: At x86 platform, XMM varargs parameters should be + // TODO: guarded with check for %al register to avoid using xmm + // TODO: registers(if they were not actually specified). + // TODO: Define set of guarded registers here if the same is neccessary + // TODO: for AArch64 (https://bugs.llvm.org/show_bug.cgi?id=42219). + // TODO: Otherwise remove this comment. + SmallDenseSet guardedRegs; + CCInfo.analyzeMustTailForwardedRegisters(Forwards, guardedRegs, RegParmTypes, + AssignFn); // Conservatively forward X8, since it might be used for an aggregate // return. Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3322,8 +3322,16 @@ // Compute the set of forwarded registers. The rest are scratch. 
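As a cross-check on the guarded-set parameter added to analyzeMustTailForwardedRegisters above, here is a standalone sketch of the same per-register decision: registers named in the guarded set are recorded with VReg == 0 and are not added as live-ins, while the rest keep the old addLiveIn-plus-virtual-register path. The types and the vreg counter are stubs, not LLVM API.

#include <cstdint>
#include <set>
#include <vector>

struct Fwd {
  unsigned VReg; // 0 => guarded
  uint16_t PReg;
  int VT;
};

// Stub of the selection performed in analyzeMustTailForwardedRegisters.
std::vector<Fwd> collectForwards(const std::vector<uint16_t> &RemainingRegs,
                                 const std::set<uint16_t> &GuardedRegs,
                                 int RegVT) {
  std::vector<Fwd> Forwards;
  unsigned NextVReg = 1; // stand-in for MF.addLiveIn(PReg, RC)
  for (uint16_t PReg : RemainingRegs) {
    if (GuardedRegs.count(PReg) == 0)
      Forwards.push_back({NextVReg++, PReg, RegVT}); // normal live-in + vreg
    else
      Forwards.push_back({0, PReg, RegVT}); // guarded: no vreg, no live-in
  }
  return Forwards;
}

int main() {
  std::set<uint16_t> Guarded = {17, 18};          // e.g. two XMM registers
  std::vector<uint16_t> Remaining = {40, 17, 41}; // mixed GPRs and XMMs
  auto Forwards = collectForwards(Remaining, Guarded, /*RegVT=*/1);
  return Forwards.size() == 3 ? 0 : 1;
}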
   SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms();
-  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
-                                           CC_AArch64_AAPCS);
+
+  // TODO: On x86, XMM varargs parameters are guarded with a check of the
+  // TODO: %al register to avoid touching XMM registers that were not
+  // TODO: actually specified. Define the set of guarded registers here if
+  // TODO: the same is necessary for AArch64
+  // TODO: (https://bugs.llvm.org/show_bug.cgi?id=42219).
+  // TODO: Otherwise remove this comment.
+  SmallDenseSet guardedRegs;
+  CCInfo.analyzeMustTailForwardedRegisters(Forwards, guardedRegs,
+                                           RegParmTypes, CC_AArch64_AAPCS);

   // Conservatively forward X8, since it might be used for aggregate return.
   if (!CCInfo.isAllocated(AArch64::X8)) {
Index: llvm/lib/Target/X86/X86ExpandPseudo.cpp
===================================================================
--- llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -64,6 +64,9 @@
   bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
   bool ExpandMBB(MachineBasicBlock &MBB);
+
+  void CreateTailCallBlocksPair(MachineBasicBlock &OriginalTailCallBlk,
+                                MachineBasicBlock::iterator &TCPseudoInstr);
 };
 char X86ExpandPseudo::ID = 0;
@@ -173,6 +176,209 @@
   JTMBB->erase(JTInst);
 }
+// This function replaces the original tail call instruction with two versions
+// of the tail call. One is identical to the original; the other has XMM
+// register restoring code inserted in front of it. Additionally, a branch that
+// checks %al is emitted to select the proper version of the tail call.
+//
+// f_thunk:                   f_thunk:
+// # %bb.1:             =>    # %bb.1:
+//   addq 32, %rsp              testb %al, %al
+//   jmpq tc_func               je .LBB0_2
+//                            # %bb.2:
+//                              movaps 96(%rsp), %xmm0
+//                              addq 32, %rsp
+//                              jmpq tc_func
+//                            .LBB0_2:
+//                            # %bb.3:
+//                              addq 32, %rsp
+//                              jmpq tc_func
+//
+void X86ExpandPseudo::CreateTailCallBlocksPair(
+    MachineBasicBlock &OriginalTailCallBlk,
+    MachineBasicBlock::iterator &TCPseudoInstr) {
+
+  MachineFunction *Func = OriginalTailCallBlk.getParent();
+  X86MachineFunctionInfo *X86Info = Func->getInfo();
+  const auto &Forwards = X86Info->getForwardedMustTailRegParms();
+
+  // Enumerate the forwarded registers and check whether any of them is
+  // guarded.
+  bool hasGuardedArgs = false;
+  for (auto &F : Forwards)
+    if (F.IsGuarded()) {
+      hasGuardedArgs = true;
+      break;
+    }
+
+  // Do nothing if there are no guarded registers.
+  if (!hasGuardedArgs)
+    return;
+
+  const BasicBlock *LLVM_BB = OriginalTailCallBlk.getBasicBlock();
+
+  MachineBasicBlock::iterator TailCallMInstr = std::prev(TCPseudoInstr);
+  DebugLoc DL = TCPseudoInstr->getDebugLoc();
+
+  // Create two blocks for the tail calls.
+ MachineFunction::iterator MBBIter = ++OriginalTailCallBlk.getIterator(); + MachineBasicBlock *TailCallBlkWithGuardedRegs = + Func->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *TailCallBlk = Func->CreateMachineBasicBlock(LLVM_BB); + Func->insert(MBBIter, TailCallBlkWithGuardedRegs); + Func->insert(MBBIter, TailCallBlk); + + TailCallBlk->transferSuccessors(&OriginalTailCallBlk); + OriginalTailCallBlk.addSuccessor(TailCallBlkWithGuardedRegs); + OriginalTailCallBlk.addSuccessor(TailCallBlk); + + // search for the start of stack restoring code + MachineInstr *FirstInstructionOfStackRestoringCode = &*TailCallMInstr; + + for (MachineBasicBlock::reverse_iterator CurStackRestoreInstr = + TailCallMInstr.getReverse(); + CurStackRestoreInstr != OriginalTailCallBlk.rend(); + ++CurStackRestoreInstr) { + + // skip tail call instruction + if (CurStackRestoreInstr->getOpcode() == TailCallMInstr->getOpcode()) + continue; + + // skip CFI instructions + if (CurStackRestoreInstr->isCFIInstruction()) + continue; + + if ((CurStackRestoreInstr->getOpcode() == X86::SUB64ri32 || + CurStackRestoreInstr->getOpcode() == X86::SUB64ri8 || + CurStackRestoreInstr->getOpcode() == X86::SUB32ri || + CurStackRestoreInstr->getOpcode() == X86::SUB32ri8) && + CurStackRestoreInstr->getOperand(0).getReg() == + TRI->getStackRegister()) { + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } else if ((CurStackRestoreInstr->getOpcode() == X86::ADD64ri32 || + CurStackRestoreInstr->getOpcode() == X86::ADD64ri8 || + CurStackRestoreInstr->getOpcode() == X86::ADD32ri || + CurStackRestoreInstr->getOpcode() == X86::ADD32ri8) && + CurStackRestoreInstr->getOperand(0).getReg() == + TRI->getStackRegister()) { + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } else if (CurStackRestoreInstr->getOpcode() == X86::POP64r) { + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } else if ((CurStackRestoreInstr->getOpcode() == X86::LEA32r || + CurStackRestoreInstr->getOpcode() == X86::LEA64_32r) && + CurStackRestoreInstr->getOperand(0).getReg() == + TRI->getStackRegister() && + CurStackRestoreInstr->getOperand(1).getReg() == + TRI->getStackRegister() && + CurStackRestoreInstr->getOperand(2).getImm() == 1 && + CurStackRestoreInstr->getOperand(3).getReg() == + X86::NoRegister && + CurStackRestoreInstr->getOperand(5).getReg() == + X86::NoRegister) { + // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } + + break; + } + + // copy stack restoring code and tailcall instruction into + // two created blocks. Delete copied instructions from the + // OriginalTailCallBlk. 
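The backwards walk above classifies a small set of opcode patterns as "stack restoring" code (ADD/SUB on the stack pointer, POP64r, and SP-relative LEAs), skipping CFI directives along the way. The following standalone sketch of that classification uses a stub enum in place of real LLVM opcodes and is only meant to make the loop easier to follow; it is not the patch's code.

#include <cstdio>

enum class Op { AddRspImm, SubRspImm, Pop64r, LeaFromRsp, Cfi, TailCall, Other };

// Stub of the opcode classes the loop above treats as stack-restoring code.
static bool isStackRestoreLike(Op O) {
  return O == Op::AddRspImm || O == Op::SubRspImm || O == Op::Pop64r ||
         O == Op::LeaFromRsp;
}

// Walk backwards from the tail call and find the first instruction of the
// contiguous stack-restoring run, skipping CFI directives, in the same way
// as the reverse iteration above.
static int firstStackRestoreIndex(const Op *Block, int TailCallIdx) {
  int First = TailCallIdx;
  for (int I = TailCallIdx - 1; I >= 0; --I) {
    if (Block[I] == Op::Cfi)
      continue; // CFI instructions do not break the run
    if (!isStackRestoreLike(Block[I]))
      break;
    First = I;
  }
  return First;
}

int main() {
  // ... function body ..., addq $32, %rsp, cfi, jmpq tc_func
  Op Block[] = {Op::Other, Op::Other, Op::AddRspImm, Op::Cfi, Op::TailCall};
  std::printf("restore code starts at index %d\n",
              firstStackRestoreIndex(Block, 4)); // prints 2
}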
+ MachineBasicBlock::iterator curInstr = FirstInstructionOfStackRestoringCode; + + do { + // copy instructions into TailCallBlkWithGuardedRegs + MachineInstrBuilder MIB = BuildMI(TailCallBlkWithGuardedRegs, DL, + TII->get(curInstr->getOpcode())); + + for (auto MO : curInstr->operands()) + MIB->addOperand(*Func, MO); + + // copy instructions into TailCallBlk + MachineInstrBuilder SMIB = + BuildMI(TailCallBlk, DL, TII->get(curInstr->getOpcode())); + + for (auto MO : curInstr->operands()) + SMIB->addOperand(*Func, MO); + + // stop copying if we achieved tail call instruction + if (curInstr->getOpcode() == TailCallMInstr->getOpcode()) { + OriginalTailCallBlk.erase(curInstr); + break; + } + + curInstr = &*OriginalTailCallBlk.erase(curInstr); + } while (curInstr != OriginalTailCallBlk.end()); + + // copy call site information into new tail call instructions + OriginalTailCallBlk.getParent()->copyCallSiteInfo( + &*TCPseudoInstr, &*TailCallBlkWithGuardedRegs->getLastNonDebugInstr()); + + OriginalTailCallBlk.getParent()->copyCallSiteInfo( + &*TCPseudoInstr, &*TailCallBlk->getLastNonDebugInstr()); + + // If %al is 0, branch around the XMM save block. + BuildMI(&OriginalTailCallBlk, DL, TII->get(X86::TEST8rr)) + .addReg(X86::AL) + .addReg(X86::AL); + BuildMI(&OriginalTailCallBlk, DL, TII->get(X86::JCC_1)) + .addMBB(TailCallBlk) + .addImm(X86::COND_E); + + // add code restoring xmm regsiters into start of TailCallInstrFromGuardedBlk + MachineInstr &TailCallInstrFromGuardedBlk = + *TailCallBlkWithGuardedRegs->getLastNonDebugInstr(); + + // TODO: take into account YMM, ZMM here + unsigned MOVOpc = STI->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm; + + int RegIdx = 0; + for (const auto &Fwd : Forwards) { + if (Fwd.IsGuarded()) { + int64_t OffsetInsideSaveArea = + (Func->getFrameInfo().hasVAStart() ? X86Info->getVarArgsFPOffset() + : 0); + unsigned BaseReg; + int64_t Offset = + X86FL->getFrameIndexReference( + *Func, X86Info->getThunkRegSaveFrameIndex(), BaseReg) + + RegIdx * 16 + OffsetInsideSaveArea; + + MachineMemOperand *MMO = Func->getMachineMemOperand( + MachinePointerInfo::getFixedStack( + *Func, X86Info->getThunkRegSaveFrameIndex(), Offset), + MachineMemOperand::MOLoad, + /*Size=*/16, /*Align=*/16); + + BuildMI(*TailCallBlkWithGuardedRegs, TailCallBlkWithGuardedRegs->begin(), + DL, TII->get(MOVOpc), Fwd.PReg) + .addReg(BaseReg) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addMemOperand(MMO); + + TailCallInstrFromGuardedBlk.addOperand( + MachineOperand::CreateReg(Fwd.PReg, false /*IsDef*/, true /*IsImp*/)); + RegIdx++; + } + } + + // add liveins into newly created blocks + for (auto &MO : TCPseudoInstr->operands()) { + if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) { + TailCallBlk->addLiveIn(MO.getReg()); + TailCallBlkWithGuardedRegs->addLiveIn(MO.getReg()); + } + } +} + /// If \p MBBI is a pseudo instruction, this method expands /// it to the corresponding (sequence of) actual instruction(s). /// \returns true if \p MBBI has been expanded. @@ -275,7 +481,17 @@ MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); - MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); + MBB.getParent()->copyCallSiteInfo(&*MBBI, &NewMI); + MachineFunction *Func = MBB.getParent(); + + // check for case when variadic function is a thunk. + // We need to propagate parameters into final tailcall then. + // Passing xmm parameters a bit tricky in this case. 
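For readers less familiar with how X86 machine memory operands are spelled, the reload emitted above follows the usual base/scale/index/displacement/segment operand order, and each guarded XMM register occupies one 16-byte slot in the save area. A small arithmetic sketch of the displacement computation follows; the base offset here is made up, while in the code above it comes from getFrameIndexReference() on ThunkRegSaveFrameIndex plus VarArgsFPOffset when va_start is present.

#include <cstdio>

int main() {
  const long SaveAreaBase = -64; // hypothetical frame offset, for illustration
  const long XmmSlotSize = 16;   // one 128-bit XMM register per slot
  for (int RegIdx = 0; RegIdx < 3; ++RegIdx)
    std::printf("guarded xmm #%d -> movaps displacement %ld\n", RegIdx,
                SaveAreaBase + RegIdx * XmmSlotSize);
}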
+ // Xmm parameters should be guarded with the check for %al + // register. + if (!STI->isCallingConvWin64(Func->getFunction().getCallingConv()) && + STI->is64Bit() && Func->getFrameInfo().hasMustTailInVarArgFunc()) + CreateTailCallBlocksPair(MBB, MBBI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); Index: llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -24,652 +24,771 @@ namespace X86ISD { // X86 Specific DAG Nodes - enum NodeType : unsigned { - // Start the numbering where the builtin ops leave off. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - /// Bit scan forward. - BSF, - /// Bit scan reverse. - BSR, - - /// Double shift instructions. These correspond to - /// X86::SHLDxx and X86::SHRDxx instructions. - SHLD, - SHRD, - - /// Bitwise logical AND of floating point values. This corresponds - /// to X86::ANDPS or X86::ANDPD. - FAND, - - /// Bitwise logical OR of floating point values. This corresponds - /// to X86::ORPS or X86::ORPD. - FOR, - - /// Bitwise logical XOR of floating point values. This corresponds - /// to X86::XORPS or X86::XORPD. - FXOR, - - /// Bitwise logical ANDNOT of floating point values. This - /// corresponds to X86::ANDNPS or X86::ANDNPD. - FANDN, - - /// These operations represent an abstract X86 call - /// instruction, which includes a bunch of information. In particular the - /// operands of these node are: - /// - /// #0 - The incoming token chain - /// #1 - The callee - /// #2 - The number of arg bytes the caller pushes on the stack. - /// #3 - The number of arg bytes the callee pops off the stack. - /// #4 - The value to pass in AL/AX/EAX (optional) - /// #5 - The value to pass in DL/DX/EDX (optional) - /// - /// The result values of these nodes are: - /// - /// #0 - The outgoing token chain - /// #1 - The first register result value (optional) - /// #2 - The second register result value (optional) - /// - CALL, - - /// Same as call except it adds the NoTrack prefix. - NT_CALL, - - /// X86 compare and logical compare instructions. - CMP, COMI, UCOMI, - - /// X86 bit-test instructions. - BT, - - /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS - /// operand, usually produced by a CMP instruction. - SETCC, - - /// X86 Select - SELECTS, - - // Same as SETCC except it's materialized with a sbb and the value is all - // one's or all zero's. - SETCC_CARRY, // R = carry_bit ? ~0 : 0 - - /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. - /// Operands are two FP values to compare; result is a mask of - /// 0s or 1s. Generally DTRT for C/C++ with NaNs. - FSETCC, - - /// X86 FP SETCC, similar to above, but with output as an i1 mask and - /// and a version with SAE. - FSETCCM, FSETCCM_SAE, - - /// X86 conditional moves. Operand 0 and operand 1 are the two values - /// to select from. Operand 2 is the condition code, and operand 3 is the - /// flag operand produced by a CMP or TEST instruction. - CMOV, - - /// X86 conditional branches. Operand 0 is the chain operand, operand 1 - /// is the block to branch if condition is true, operand 2 is the - /// condition code, and operand 3 is the flag operand produced by a CMP - /// or TEST instruction. - BRCOND, - - /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and - /// operand 1 is the target address. - NT_BRIND, - - /// Return with a flag operand. 
Operand 0 is the chain operand, operand - /// 1 is the number of bytes of stack to pop. - RET_FLAG, - - /// Return from interrupt. Operand 0 is the number of bytes to pop. - IRET, - - /// Repeat fill, corresponds to X86::REP_STOSx. - REP_STOS, - - /// Repeat move, corresponds to X86::REP_MOVSx. - REP_MOVS, - - /// On Darwin, this node represents the result of the popl - /// at function entry, used for PIC code. - GlobalBaseReg, - - /// A wrapper node for TargetConstantPool, TargetJumpTable, - /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress, - /// MCSymbol and TargetBlockAddress. - Wrapper, - - /// Special wrapper used under X86-64 PIC mode for RIP - /// relative displacements. - WrapperRIP, - - /// Copies a 64-bit value from an MMX vector to the low word - /// of an XMM vector, with the high word zero filled. - MOVQ2DQ, - - /// Copies a 64-bit value from the low word of an XMM vector - /// to an MMX vector. - MOVDQ2Q, - - /// Copies a 32-bit value from the low word of a MMX - /// vector to a GPR. - MMX_MOVD2W, - - /// Copies a GPR into the low 32-bit word of a MMX vector - /// and zero out the high word. - MMX_MOVW2D, - - /// Extract an 8-bit value from a vector and zero extend it to - /// i32, corresponds to X86::PEXTRB. - PEXTRB, - - /// Extract a 16-bit value from a vector and zero extend it to - /// i32, corresponds to X86::PEXTRW. - PEXTRW, - - /// Insert any element of a 4 x float vector into any element - /// of a destination 4 x floatvector. - INSERTPS, - - /// Insert the lower 8-bits of a 32-bit value to a vector, - /// corresponds to X86::PINSRB. - PINSRB, - - /// Insert the lower 16-bits of a 32-bit value to a vector, - /// corresponds to X86::PINSRW. - PINSRW, - - /// Shuffle 16 8-bit values within a vector. - PSHUFB, - - /// Compute Sum of Absolute Differences. - PSADBW, - /// Compute Double Block Packed Sum-Absolute-Differences - DBPSADBW, - - /// Bitwise Logical AND NOT of Packed FP values. - ANDNP, - - /// Blend where the selector is an immediate. - BLENDI, - - /// Dynamic (non-constant condition) vector blend where only the sign bits - /// of the condition elements are used. This is used to enforce that the - /// condition mask is not valid for generic VSELECT optimizations. This - /// is also used to implement the intrinsics. - /// Operands are in VSELECT order: MASK, TRUE, FALSE - BLENDV, - - /// Combined add and sub on an FP vector. - ADDSUB, - - // FP vector ops with rounding mode. - FADD_RND, FADDS, FADDS_RND, - FSUB_RND, FSUBS, FSUBS_RND, - FMUL_RND, FMULS, FMULS_RND, - FDIV_RND, FDIVS, FDIVS_RND, - FMAX_SAE, FMAXS_SAE, - FMIN_SAE, FMINS_SAE, - FSQRT_RND, FSQRTS, FSQRTS_RND, - - // FP vector get exponent. - FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE, - // Extract Normalized Mantissas. - VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE, - // FP Scale. - SCALEF, SCALEF_RND, - SCALEFS, SCALEFS_RND, - - // Unsigned Integer average. - AVG, - - /// Integer horizontal add/sub. - HADD, - HSUB, - - /// Floating point horizontal add/sub. - FHADD, - FHSUB, - - // Detect Conflicts Within a Vector - CONFLICT, - - /// Floating point max and min. - FMAX, FMIN, - - /// Commutative FMIN and FMAX. - FMAXC, FMINC, - - /// Scalar intrinsic floating point max and min. - FMAXS, FMINS, - - /// Floating point reciprocal-sqrt and reciprocal approximation. - /// Note that these typically require refinement - /// in order to obtain suitable precision. - FRSQRT, FRCP, - - // AVX-512 reciprocal approximations with a little more precision. 
- RSQRT14, RSQRT14S, RCP14, RCP14S, - - // Thread Local Storage. - TLSADDR, + enum NodeType : unsigned { + // Start the numbering where the builtin ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + /// Bit scan forward. + BSF, + /// Bit scan reverse. + BSR, + + /// Double shift instructions. These correspond to + /// X86::SHLDxx and X86::SHRDxx instructions. + SHLD, + SHRD, + + /// Bitwise logical AND of floating point values. This corresponds + /// to X86::ANDPS or X86::ANDPD. + FAND, + + /// Bitwise logical OR of floating point values. This corresponds + /// to X86::ORPS or X86::ORPD. + FOR, + + /// Bitwise logical XOR of floating point values. This corresponds + /// to X86::XORPS or X86::XORPD. + FXOR, + + /// Bitwise logical ANDNOT of floating point values. This + /// corresponds to X86::ANDNPS or X86::ANDNPD. + FANDN, + + /// These operations represent an abstract X86 call + /// instruction, which includes a bunch of information. In particular the + /// operands of these node are: + /// + /// #0 - The incoming token chain + /// #1 - The callee + /// #2 - The number of arg bytes the caller pushes on the stack. + /// #3 - The number of arg bytes the callee pops off the stack. + /// #4 - The value to pass in AL/AX/EAX (optional) + /// #5 - The value to pass in DL/DX/EDX (optional) + /// + /// The result values of these nodes are: + /// + /// #0 - The outgoing token chain + /// #1 - The first register result value (optional) + /// #2 - The second register result value (optional) + /// + CALL, - // Thread Local Storage. A call to get the start address - // of the TLS block for the current module. - TLSBASEADDR, + /// Same as call except it adds the NoTrack prefix. + NT_CALL, - // Thread Local Storage. When calling to an OS provided - // thunk at the address from an earlier relocation. - TLSCALL, + /// X86 compare and logical compare instructions. + CMP, + COMI, + UCOMI, - // Exception Handling helpers. - EH_RETURN, + /// X86 bit-test instructions. + BT, - // SjLj exception handling setjmp. - EH_SJLJ_SETJMP, + /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS + /// operand, usually produced by a CMP instruction. + SETCC, - // SjLj exception handling longjmp. - EH_SJLJ_LONGJMP, + /// X86 Select + SELECTS, - // SjLj exception handling dispatch. - EH_SJLJ_SETUP_DISPATCH, + // Same as SETCC except it's materialized with a sbb and the value is all + // one's or all zero's. + SETCC_CARRY, // R = carry_bit ? ~0 : 0 - /// Tail call return. See X86TargetLowering::LowerCall for - /// the list of operands. - TC_RETURN, + /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. + /// Operands are two FP values to compare; result is a mask of + /// 0s or 1s. Generally DTRT for C/C++ with NaNs. + FSETCC, - // Vector move to low scalar and zero higher vector elements. - VZEXT_MOVL, + /// X86 FP SETCC, similar to above, but with output as an i1 mask and + /// and a version with SAE. + FSETCCM, + FSETCCM_SAE, - // Vector integer truncate. - VTRUNC, - // Vector integer truncate with unsigned/signed saturation. - VTRUNCUS, VTRUNCS, + /// X86 conditional moves. Operand 0 and operand 1 are the two values + /// to select from. Operand 2 is the condition code, and operand 3 is the + /// flag operand produced by a CMP or TEST instruction. + CMOV, - // Masked version of the above. Used when less than a 128-bit result is - // produced since the mask only applies to the lower elements and can't - // be represented by a select. 
- // SRC, PASSTHRU, MASK - VMTRUNC, VMTRUNCUS, VMTRUNCS, - - // Vector FP extend. - VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE, - - // Vector FP round. - VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND, - - // Masked version of above. Used for v2f64->v4f32. - // SRC, PASSTHRU, MASK - VMFPROUND, - - // 128-bit vector logical left / right shift - VSHLDQ, VSRLDQ, - - // Vector shift elements - VSHL, VSRL, VSRA, - - // Vector variable shift - VSHLV, VSRLV, VSRAV, - - // Vector shift elements by immediate - VSHLI, VSRLI, VSRAI, - - // Shifts of mask registers. - KSHIFTL, KSHIFTR, - - // Bit rotate by immediate - VROTLI, VROTRI, - - // Vector packed double/float comparison. - CMPP, - - // Vector integer comparisons. - PCMPEQ, PCMPGT, - - // v8i16 Horizontal minimum and position. - PHMINPOS, - - MULTISHIFT, - - /// Vector comparison generating mask bits for fp and - /// integer signed and unsigned data types. - CMPM, - // Vector comparison with SAE for FP values - CMPM_SAE, - - // Arithmetic operations with FLAGS results. - ADD, SUB, ADC, SBB, SMUL, UMUL, - OR, XOR, AND, - - // Bit field extract. - BEXTR, - - // Zero High Bits Starting with Specified Bit Position. - BZHI, - - // X86-specific multiply by immediate. - MUL_IMM, - - // Vector sign bit extraction. - MOVMSK, - - // Vector bitwise comparisons. - PTEST, - - // Vector packed fp sign bitwise comparisons. - TESTP, - - // OR/AND test for masks. - KORTEST, - KTEST, - - // ADD for masks. - KADD, - - // Several flavors of instructions with vector shuffle behaviors. - // Saturated signed/unnsigned packing. - PACKSS, - PACKUS, - // Intra-lane alignr. - PALIGNR, - // AVX512 inter-lane alignr. - VALIGN, - PSHUFD, - PSHUFHW, - PSHUFLW, - SHUFP, - // VBMI2 Concat & Shift. - VSHLD, - VSHRD, - VSHLDV, - VSHRDV, - //Shuffle Packed Values at 128-bit granularity. - SHUF128, - MOVDDUP, - MOVSHDUP, - MOVSLDUP, - MOVLHPS, - MOVHLPS, - MOVSD, - MOVSS, - UNPCKL, - UNPCKH, - VPERMILPV, - VPERMILPI, - VPERMI, - VPERM2X128, - - // Variable Permute (VPERM). - // Res = VPERMV MaskV, V0 - VPERMV, - - // 3-op Variable Permute (VPERMT2). - // Res = VPERMV3 V0, MaskV, V1 - VPERMV3, - - // Bitwise ternary logic. - VPTERNLOG, - // Fix Up Special Packed Float32/64 values. - VFIXUPIMM, VFIXUPIMM_SAE, - VFIXUPIMMS, VFIXUPIMMS_SAE, - // Range Restriction Calculation For Packed Pairs of Float32/64 values. - VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE, - // Reduce - Perform Reduction Transformation on scalar\packed FP. - VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE, - // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. - // Also used by the legacy (V)ROUND intrinsics where we mask out the - // scaling part of the immediate. - VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE, - // Tests Types Of a FP Values for packed types. - VFPCLASS, - // Tests Types Of a FP Values for scalar types. - VFPCLASSS, - - // Broadcast (splat) scalar or element 0 of a vector. If the operand is - // a vector, this node may change the vector length as part of the splat. - VBROADCAST, - // Broadcast mask to vector. - VBROADCASTM, - // Broadcast subvector to vector. - SUBV_BROADCAST, - - /// SSE4A Extraction and Insertion. - EXTRQI, INSERTQI, - - // XOP arithmetic/logical shifts. - VPSHA, VPSHL, - // XOP signed/unsigned integer comparisons. - VPCOM, VPCOMU, - // XOP packed permute bytes. - VPPERM, - // XOP two source permutation. - VPERMIL2, - - // Vector multiply packed unsigned doubleword integers. - PMULUDQ, - // Vector multiply packed signed doubleword integers. 
- PMULDQ, - // Vector Multiply Packed UnsignedIntegers with Round and Scale. - MULHRS, - - // Multiply and Add Packed Integers. - VPMADDUBSW, VPMADDWD, - - // AVX512IFMA multiply and add. - // NOTE: These are different than the instruction and perform - // op0 x op1 + op2. - VPMADD52L, VPMADD52H, - - // VNNI - VPDPBUSD, - VPDPBUSDS, - VPDPWSSD, - VPDPWSSDS, - - // FMA nodes. - // We use the target independent ISD::FMA for the non-inverted case. - FNMADD, - FMSUB, - FNMSUB, - FMADDSUB, - FMSUBADD, - - // FMA with rounding mode. - FMADD_RND, - FNMADD_RND, - FMSUB_RND, - FNMSUB_RND, - FMADDSUB_RND, - FMSUBADD_RND, - - // Compress and expand. - COMPRESS, - EXPAND, - - // Bits shuffle - VPSHUFBITQMB, - - // Convert Unsigned/Integer to Floating-Point Value with rounding mode. - SINT_TO_FP_RND, UINT_TO_FP_RND, - SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP, - SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND, - - // Vector float/double to signed/unsigned integer. - CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND, - // Scalar float/double to signed/unsigned integer. - CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND, - - // Vector float/double to signed/unsigned integer with truncation. - CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE, - // Scalar float/double to signed/unsigned integer with truncation. - CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE, - - // Vector signed/unsigned integer to float/double. - CVTSI2P, CVTUI2P, - - // Masked versions of above. Used for v2f64->v4f32. - // SRC, PASSTHRU, MASK - MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI, - MCVTSI2P, MCVTUI2P, - - // Vector float to bfloat16. - // Convert TWO packed single data to one packed BF16 data - CVTNE2PS2BF16, - // Convert packed single data to packed BF16 data - CVTNEPS2BF16, - // Masked version of above. - // SRC, PASSTHRU, MASK - MCVTNEPS2BF16, - - // Dot product of BF16 pairs to accumulated into - // packed single precision. - DPBF16PS, - - // Save xmm argument registers to the stack, according to %al. An operator - // is needed so that this can be expanded with control flow. - VASTART_SAVE_XMM_REGS, - - // Windows's _chkstk call to do stack probing. - WIN_ALLOCA, - - // For allocating variable amounts of stack space when using - // segmented stacks. Check if the current stacklet has enough space, and - // falls back to heap allocation if not. - SEG_ALLOCA, - - // Memory barriers. - MEMBARRIER, - MFENCE, - - // Store FP status word into i16 register. - FNSTSW16r, - - // Store contents of %ah into %eflags. - SAHF, - - // Get a random integer and indicate whether it is valid in CF. - RDRAND, - - // Get a NIST SP800-90B & C compliant random integer and - // indicate whether it is valid in CF. - RDSEED, - - // Protection keys - // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. - // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is - // value for ECX. - RDPKRU, WRPKRU, - - // SSE42 string comparisons. - // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG - // will emit one or two instructions based on which results are used. If - // flags and index/mask this allows us to use a single instruction since - // we won't have to pick and opcode for flags. Instead we can rely on the - // DAG to CSE everything and decide at isel. - PCMPISTR, - PCMPESTR, - - // Test if in transactional execution. - XTEST, - - // ERI instructions. - RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE, - RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE, - - // Conversions between float and half-float. 
- CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE, - - // Masked version of above. - // SRC, RND, PASSTHRU, MASK - MCVTPS2PH, - - // Galois Field Arithmetic Instructions - GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB, - - // LWP insert record. - LWPINS, - - // User level wait - UMWAIT, TPAUSE, - - // Enqueue Stores Instructions - ENQCMD, ENQCMDS, - - // For avx512-vp2intersect - VP2INTERSECT, - - // Compare and swap. - LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, - LCMPXCHG8_DAG, - LCMPXCHG16_DAG, - LCMPXCHG8_SAVE_EBX_DAG, - LCMPXCHG16_SAVE_RBX_DAG, - - /// LOCK-prefixed arithmetic read-modify-write instructions. - /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) - LADD, LSUB, LOR, LXOR, LAND, - - // Load, scalar_to_vector, and zero extend. - VZEXT_LOAD, - - // extract_vector_elt, store. - VEXTRACT_STORE, - - // scalar broadcast from memory - VBROADCAST_LOAD, - - // Store FP control world into i16 memory. - FNSTCW16m, - - /// This instruction implements FP_TO_SINT with the - /// integer destination in memory and a FP reg source. This corresponds - /// to the X86::FIST*m instructions and the rounding mode change stuff. It - /// has two inputs (token chain and address) and two outputs (int value - /// and token chain). Memory VT specifies the type to store to. - FP_TO_INT_IN_MEM, - - /// This instruction implements SINT_TO_FP with the - /// integer source in memory and FP reg result. This corresponds to the - /// X86::FILD*m instructions. It has two inputs (token chain and address) - /// and two outputs (FP value and token chain). FILD_FLAG also produces a - /// flag). The integer source type is specified by the memory VT. - FILD, - FILD_FLAG, - - /// This instruction implements a fp->int store from FP stack - /// slots. This corresponds to the fist instruction. It takes a - /// chain operand, value to store, address, and glue. The memory VT - /// specifies the type to store as. - FIST, - - /// This instruction implements an extending load to FP stack slots. - /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain - /// operand, and ptr to load from. The memory VT specifies the type to - /// load from. - FLD, - - /// This instruction implements a truncating store from FP stack - /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a - /// chain operand, value to store, address, and glue. The memory VT - /// specifies the type to store as. - FST, - - /// This instruction grabs the address of the next argument - /// from a va_list. (reads and modifies the va_list in memory) - VAARG_64, - - // Vector truncating store with unsigned/signed saturation - VTRUNCSTOREUS, VTRUNCSTORES, - // Vector truncating masked store with unsigned/signed saturation - VMTRUNCSTOREUS, VMTRUNCSTORES, - - // X86 specific gather and scatter - MGATHER, MSCATTER, - - // WARNING: Do not add anything in the end unless you want the node to - // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all - // opcodes will be thought as target memory ops! - }; + /// X86 conditional branches. Operand 0 is the chain operand, operand 1 + /// is the block to branch if condition is true, operand 2 is the + /// condition code, and operand 3 is the flag operand produced by a CMP + /// or TEST instruction. + BRCOND, + + /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and + /// operand 1 is the target address. + NT_BRIND, + + /// Return with a flag operand. Operand 0 is the chain operand, operand + /// 1 is the number of bytes of stack to pop. + RET_FLAG, + + /// Return from interrupt. 
Operand 0 is the number of bytes to pop. + IRET, + + /// Repeat fill, corresponds to X86::REP_STOSx. + REP_STOS, + + /// Repeat move, corresponds to X86::REP_MOVSx. + REP_MOVS, + + /// On Darwin, this node represents the result of the popl + /// at function entry, used for PIC code. + GlobalBaseReg, + + /// A wrapper node for TargetConstantPool, TargetJumpTable, + /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress, + /// MCSymbol and TargetBlockAddress. + Wrapper, + + /// Special wrapper used under X86-64 PIC mode for RIP + /// relative displacements. + WrapperRIP, + + /// Copies a 64-bit value from an MMX vector to the low word + /// of an XMM vector, with the high word zero filled. + MOVQ2DQ, + + /// Copies a 64-bit value from the low word of an XMM vector + /// to an MMX vector. + MOVDQ2Q, + + /// Copies a 32-bit value from the low word of a MMX + /// vector to a GPR. + MMX_MOVD2W, + + /// Copies a GPR into the low 32-bit word of a MMX vector + /// and zero out the high word. + MMX_MOVW2D, + + /// Extract an 8-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRB. + PEXTRB, + + /// Extract a 16-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRW. + PEXTRW, + + /// Insert any element of a 4 x float vector into any element + /// of a destination 4 x floatvector. + INSERTPS, + + /// Insert the lower 8-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRB. + PINSRB, + + /// Insert the lower 16-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRW. + PINSRW, + + /// Shuffle 16 8-bit values within a vector. + PSHUFB, + + /// Compute Sum of Absolute Differences. + PSADBW, + /// Compute Double Block Packed Sum-Absolute-Differences + DBPSADBW, + + /// Bitwise Logical AND NOT of Packed FP values. + ANDNP, + + /// Blend where the selector is an immediate. + BLENDI, + + /// Dynamic (non-constant condition) vector blend where only the sign bits + /// of the condition elements are used. This is used to enforce that the + /// condition mask is not valid for generic VSELECT optimizations. This + /// is also used to implement the intrinsics. + /// Operands are in VSELECT order: MASK, TRUE, FALSE + BLENDV, + + /// Combined add and sub on an FP vector. + ADDSUB, + + // FP vector ops with rounding mode. + FADD_RND, + FADDS, + FADDS_RND, + FSUB_RND, + FSUBS, + FSUBS_RND, + FMUL_RND, + FMULS, + FMULS_RND, + FDIV_RND, + FDIVS, + FDIVS_RND, + FMAX_SAE, + FMAXS_SAE, + FMIN_SAE, + FMINS_SAE, + FSQRT_RND, + FSQRTS, + FSQRTS_RND, + + // FP vector get exponent. + FGETEXP, + FGETEXP_SAE, + FGETEXPS, + FGETEXPS_SAE, + // Extract Normalized Mantissas. + VGETMANT, + VGETMANT_SAE, + VGETMANTS, + VGETMANTS_SAE, + // FP Scale. + SCALEF, + SCALEF_RND, + SCALEFS, + SCALEFS_RND, + + // Unsigned Integer average. + AVG, + + /// Integer horizontal add/sub. + HADD, + HSUB, + + /// Floating point horizontal add/sub. + FHADD, + FHSUB, + + // Detect Conflicts Within a Vector + CONFLICT, + + /// Floating point max and min. + FMAX, + FMIN, + + /// Commutative FMIN and FMAX. + FMAXC, + FMINC, + + /// Scalar intrinsic floating point max and min. + FMAXS, + FMINS, + + /// Floating point reciprocal-sqrt and reciprocal approximation. + /// Note that these typically require refinement + /// in order to obtain suitable precision. + FRSQRT, + FRCP, + + // AVX-512 reciprocal approximations with a little more precision. + RSQRT14, + RSQRT14S, + RCP14, + RCP14S, + + // Thread Local Storage. + TLSADDR, + + // Thread Local Storage. 
A call to get the start address + // of the TLS block for the current module. + TLSBASEADDR, + + // Thread Local Storage. When calling to an OS provided + // thunk at the address from an earlier relocation. + TLSCALL, + + // Exception Handling helpers. + EH_RETURN, + + // SjLj exception handling setjmp. + EH_SJLJ_SETJMP, + + // SjLj exception handling longjmp. + EH_SJLJ_LONGJMP, + + // SjLj exception handling dispatch. + EH_SJLJ_SETUP_DISPATCH, + + /// Tail call return. See X86TargetLowering::LowerCall for + /// the list of operands. + TC_RETURN, + + // Vector move to low scalar and zero higher vector elements. + VZEXT_MOVL, + + // Vector integer truncate. + VTRUNC, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, + VTRUNCS, + + // Masked version of the above. Used when less than a 128-bit result is + // produced since the mask only applies to the lower elements and can't + // be represented by a select. + // SRC, PASSTHRU, MASK + VMTRUNC, + VMTRUNCUS, + VMTRUNCS, + + // Vector FP extend. + VFPEXT, + VFPEXT_SAE, + VFPEXTS, + VFPEXTS_SAE, + + // Vector FP round. + VFPROUND, + VFPROUND_RND, + VFPROUNDS, + VFPROUNDS_RND, + + // Masked version of above. Used for v2f64->v4f32. + // SRC, PASSTHRU, MASK + VMFPROUND, + + // 128-bit vector logical left / right shift + VSHLDQ, + VSRLDQ, + + // Vector shift elements + VSHL, + VSRL, + VSRA, + + // Vector variable shift + VSHLV, + VSRLV, + VSRAV, + + // Vector shift elements by immediate + VSHLI, + VSRLI, + VSRAI, + + // Shifts of mask registers. + KSHIFTL, + KSHIFTR, + + // Bit rotate by immediate + VROTLI, + VROTRI, + + // Vector packed double/float comparison. + CMPP, + + // Vector integer comparisons. + PCMPEQ, + PCMPGT, + + // v8i16 Horizontal minimum and position. + PHMINPOS, + + MULTISHIFT, + + /// Vector comparison generating mask bits for fp and + /// integer signed and unsigned data types. + CMPM, + // Vector comparison with SAE for FP values + CMPM_SAE, + + // Arithmetic operations with FLAGS results. + ADD, + SUB, + ADC, + SBB, + SMUL, + UMUL, + OR, + XOR, + AND, + + // Bit field extract. + BEXTR, + + // Zero High Bits Starting with Specified Bit Position. + BZHI, + + // X86-specific multiply by immediate. + MUL_IMM, + + // Vector sign bit extraction. + MOVMSK, + + // Vector bitwise comparisons. + PTEST, + + // Vector packed fp sign bitwise comparisons. + TESTP, + + // OR/AND test for masks. + KORTEST, + KTEST, + + // ADD for masks. + KADD, + + // Several flavors of instructions with vector shuffle behaviors. + // Saturated signed/unnsigned packing. + PACKSS, + PACKUS, + // Intra-lane alignr. + PALIGNR, + // AVX512 inter-lane alignr. + VALIGN, + PSHUFD, + PSHUFHW, + PSHUFLW, + SHUFP, + // VBMI2 Concat & Shift. + VSHLD, + VSHRD, + VSHLDV, + VSHRDV, + // Shuffle Packed Values at 128-bit granularity. + SHUF128, + MOVDDUP, + MOVSHDUP, + MOVSLDUP, + MOVLHPS, + MOVHLPS, + MOVSD, + MOVSS, + UNPCKL, + UNPCKH, + VPERMILPV, + VPERMILPI, + VPERMI, + VPERM2X128, + + // Variable Permute (VPERM). + // Res = VPERMV MaskV, V0 + VPERMV, + + // 3-op Variable Permute (VPERMT2). + // Res = VPERMV3 V0, MaskV, V1 + VPERMV3, + + // Bitwise ternary logic. + VPTERNLOG, + // Fix Up Special Packed Float32/64 values. + VFIXUPIMM, + VFIXUPIMM_SAE, + VFIXUPIMMS, + VFIXUPIMMS_SAE, + // Range Restriction Calculation For Packed Pairs of Float32/64 values. + VRANGE, + VRANGE_SAE, + VRANGES, + VRANGES_SAE, + // Reduce - Perform Reduction Transformation on scalar\packed FP. 
+ VREDUCE, + VREDUCE_SAE, + VREDUCES, + VREDUCES_SAE, + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. + // Also used by the legacy (V)ROUND intrinsics where we mask out the + // scaling part of the immediate. + VRNDSCALE, + VRNDSCALE_SAE, + VRNDSCALES, + VRNDSCALES_SAE, + // Tests Types Of a FP Values for packed types. + VFPCLASS, + // Tests Types Of a FP Values for scalar types. + VFPCLASSS, + + // Broadcast (splat) scalar or element 0 of a vector. If the operand is + // a vector, this node may change the vector length as part of the splat. + VBROADCAST, + // Broadcast mask to vector. + VBROADCASTM, + // Broadcast subvector to vector. + SUBV_BROADCAST, + + /// SSE4A Extraction and Insertion. + EXTRQI, + INSERTQI, + + // XOP arithmetic/logical shifts. + VPSHA, + VPSHL, + // XOP signed/unsigned integer comparisons. + VPCOM, + VPCOMU, + // XOP packed permute bytes. + VPPERM, + // XOP two source permutation. + VPERMIL2, + + // Vector multiply packed unsigned doubleword integers. + PMULUDQ, + // Vector multiply packed signed doubleword integers. + PMULDQ, + // Vector Multiply Packed UnsignedIntegers with Round and Scale. + MULHRS, + + // Multiply and Add Packed Integers. + VPMADDUBSW, + VPMADDWD, + + // AVX512IFMA multiply and add. + // NOTE: These are different than the instruction and perform + // op0 x op1 + op2. + VPMADD52L, + VPMADD52H, + + // VNNI + VPDPBUSD, + VPDPBUSDS, + VPDPWSSD, + VPDPWSSDS, + + // FMA nodes. + // We use the target independent ISD::FMA for the non-inverted case. + FNMADD, + FMSUB, + FNMSUB, + FMADDSUB, + FMSUBADD, + + // FMA with rounding mode. + FMADD_RND, + FNMADD_RND, + FMSUB_RND, + FNMSUB_RND, + FMADDSUB_RND, + FMSUBADD_RND, + + // Compress and expand. + COMPRESS, + EXPAND, + + // Bits shuffle + VPSHUFBITQMB, + + // Convert Unsigned/Integer to Floating-Point Value with rounding mode. + SINT_TO_FP_RND, + UINT_TO_FP_RND, + SCALAR_SINT_TO_FP, + SCALAR_UINT_TO_FP, + SCALAR_SINT_TO_FP_RND, + SCALAR_UINT_TO_FP_RND, + + // Vector float/double to signed/unsigned integer. + CVTP2SI, + CVTP2UI, + CVTP2SI_RND, + CVTP2UI_RND, + // Scalar float/double to signed/unsigned integer. + CVTS2SI, + CVTS2UI, + CVTS2SI_RND, + CVTS2UI_RND, + + // Vector float/double to signed/unsigned integer with truncation. + CVTTP2SI, + CVTTP2UI, + CVTTP2SI_SAE, + CVTTP2UI_SAE, + // Scalar float/double to signed/unsigned integer with truncation. + CVTTS2SI, + CVTTS2UI, + CVTTS2SI_SAE, + CVTTS2UI_SAE, + + // Vector signed/unsigned integer to float/double. + CVTSI2P, + CVTUI2P, + + // Masked versions of above. Used for v2f64->v4f32. + // SRC, PASSTHRU, MASK + MCVTP2SI, + MCVTP2UI, + MCVTTP2SI, + MCVTTP2UI, + MCVTSI2P, + MCVTUI2P, + + // Vector float to bfloat16. + // Convert TWO packed single data to one packed BF16 data + CVTNE2PS2BF16, + // Convert packed single data to packed BF16 data + CVTNEPS2BF16, + // Masked version of above. + // SRC, PASSTHRU, MASK + MCVTNEPS2BF16, + + // Dot product of BF16 pairs to accumulated into + // packed single precision. + DPBF16PS, + + // Save xmm argument registers to the stack, according to %al. An operator + // is needed so that this can be expanded with control flow. + VASTART_SAVE_XMM_REGS, + + // Save xmm argument registers of the vararg thunk function to the stack, + // according to %al. An operator is needed so that this can be expanded with + // control flow. + VARARG_THUNK_SAVE_XMM_REGS, + + // Windows's _chkstk call to do stack probing. 
+ WIN_ALLOCA, + + // For allocating variable amounts of stack space when using + // segmented stacks. Check if the current stacklet has enough space, and + // falls back to heap allocation if not. + SEG_ALLOCA, + + // Memory barriers. + MEMBARRIER, + MFENCE, + + // Store FP status word into i16 register. + FNSTSW16r, + + // Store contents of %ah into %eflags. + SAHF, + + // Get a random integer and indicate whether it is valid in CF. + RDRAND, + + // Get a NIST SP800-90B & C compliant random integer and + // indicate whether it is valid in CF. + RDSEED, + + // Protection keys + // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. + // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is + // value for ECX. + RDPKRU, + WRPKRU, + + // SSE42 string comparisons. + // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG + // will emit one or two instructions based on which results are used. If + // flags and index/mask this allows us to use a single instruction since + // we won't have to pick and opcode for flags. Instead we can rely on the + // DAG to CSE everything and decide at isel. + PCMPISTR, + PCMPESTR, + + // Test if in transactional execution. + XTEST, + + // ERI instructions. + RSQRT28, + RSQRT28_SAE, + RSQRT28S, + RSQRT28S_SAE, + RCP28, + RCP28_SAE, + RCP28S, + RCP28S_SAE, + EXP2, + EXP2_SAE, + + // Conversions between float and half-float. + CVTPS2PH, + CVTPH2PS, + CVTPH2PS_SAE, + + // Masked version of above. + // SRC, RND, PASSTHRU, MASK + MCVTPS2PH, + + // Galois Field Arithmetic Instructions + GF2P8AFFINEINVQB, + GF2P8AFFINEQB, + GF2P8MULB, + + // LWP insert record. + LWPINS, + + // User level wait + UMWAIT, + TPAUSE, + + // Enqueue Stores Instructions + ENQCMD, + ENQCMDS, + + // For avx512-vp2intersect + VP2INTERSECT, + + // Compare and swap. + LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, + LCMPXCHG8_DAG, + LCMPXCHG16_DAG, + LCMPXCHG8_SAVE_EBX_DAG, + LCMPXCHG16_SAVE_RBX_DAG, + + /// LOCK-prefixed arithmetic read-modify-write instructions. + /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) + LADD, + LSUB, + LOR, + LXOR, + LAND, + + // Load, scalar_to_vector, and zero extend. + VZEXT_LOAD, + + // extract_vector_elt, store. + VEXTRACT_STORE, + + // scalar broadcast from memory + VBROADCAST_LOAD, + + // Store FP control world into i16 memory. + FNSTCW16m, + + /// This instruction implements FP_TO_SINT with the + /// integer destination in memory and a FP reg source. This corresponds + /// to the X86::FIST*m instructions and the rounding mode change stuff. It + /// has two inputs (token chain and address) and two outputs (int value + /// and token chain). Memory VT specifies the type to store to. + FP_TO_INT_IN_MEM, + + /// This instruction implements SINT_TO_FP with the + /// integer source in memory and FP reg result. This corresponds to the + /// X86::FILD*m instructions. It has two inputs (token chain and address) + /// and two outputs (FP value and token chain). FILD_FLAG also produces a + /// flag). The integer source type is specified by the memory VT. + FILD, + FILD_FLAG, + + /// This instruction implements a fp->int store from FP stack + /// slots. This corresponds to the fist instruction. It takes a + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. + FIST, + + /// This instruction implements an extending load to FP stack slots. + /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain + /// operand, and ptr to load from. 
The memory VT specifies the type to + /// load from. + FLD, + + /// This instruction implements a truncating store from FP stack + /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. + FST, + + /// This instruction grabs the address of the next argument + /// from a va_list. (reads and modifies the va_list in memory) + VAARG_64, + + // Vector truncating store with unsigned/signed saturation + VTRUNCSTOREUS, + VTRUNCSTORES, + // Vector truncating masked store with unsigned/signed saturation + VMTRUNCSTOREUS, + VMTRUNCSTORES, + + // X86 specific gather and scatter + MGATHER, + MSCATTER, + + // WARNING: Do not add anything in the end unless you want the node to + // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all + // opcodes will be thought as target memory ops! + }; } // end namespace X86ISD /// Define some predicates that are used for node matching. @@ -1431,6 +1550,10 @@ EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, MachineBasicBlock *BB) const; + MachineBasicBlock * + EmitVarargThunkSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1, MachineInstr &MI2, MachineBasicBlock *BB) const; Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3354,9 +3354,14 @@ F.hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); + SmallDenseSet guardedXmmRegs; + SmallVector LiveGPRs; + SmallVector LiveXMMRegs; + SDValue ALVal; + // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. - if (Is64Bit && isVarArg && MFI.hasVAStart()) { + if (Is64Bit && isVarArg) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); @@ -3366,77 +3371,83 @@ "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. - SmallVector LiveGPRs; - SmallVector LiveXMMRegs; - SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } + if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { - unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); - LiveXMMRegs.push_back( - DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); - } - } - - if (IsWin64) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex( - MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - // Fixup to set vararg frame on shadow area (4 x i64). - if (NumIntRegs < 4) - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); - } else { - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so - // they may be loaded by dereferencing the result of va_next. 
- FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( - ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); - } - - // Store the integer parameter registers. - SmallVector MemOps; - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy(DAG.getDataLayout())); - unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (SDValue Val : LiveGPRs) { - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - RSFIN, DAG.getIntPtrConstant(Offset, dl)); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), - FuncInfo->getRegSaveFrameIndex(), Offset)); - MemOps.push_back(Store); - Offset += 8; - } - - if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { - // Now store the XMM (fp + vector) parameter registers. - SmallVector SaveXMMOps; - SaveXMMOps.push_back(Chain); - SaveXMMOps.push_back(ALVal); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getRegSaveFrameIndex(), dl)); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getVarArgsFPOffset(), dl)); - SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), - LiveXMMRegs.end()); - MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, SaveXMMOps)); + // FastRegisterAllocator spills virtual registers at basic + // block boundary. That leads to usages of xmm registers + // outside of check for %al. Pass physical registers to + // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling. + // See https://bugs.llvm.org/show_bug.cgi?id=42219. + MF.getRegInfo().addLiveIn(Reg); + LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32)); + guardedXmmRegs.insert(Reg); + } + } + + if (MFI.hasVAStart()) { + if (IsWin64) { + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( + MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by dereferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); + } + + // Store the integer parameter registers. + SmallVector MemOps; + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy(DAG.getDataLayout())); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = + DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, + DAG.getIntPtrConstant(Offset, dl)); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset)); + MemOps.push_back(Store); + Offset += 8; + } + + if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { + // Now store the XMM (fp + vector) parameter registers. 
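The VarArgsGPOffset / VarArgsFPOffset values computed above follow the SysV x86-64 register save area layout: six 8-byte GPR slots followed by eight 16-byte XMM slots (176 bytes in total), with the two offsets recording how much of each region the named arguments already consumed. A standalone sketch of that arithmetic, with example argument counts, is shown below; it is illustrative only.

#include <cstdio>

int main() {
  const unsigned NumArgGPRs = 6, NumArgXMMs = 8;   // SysV x86-64 argument regs
  unsigned NumIntRegsUsed = 2, NumXMMRegsUsed = 1; // example named arguments
  unsigned GPOffset = NumIntRegsUsed * 8;
  unsigned FPOffset = NumArgGPRs * 8 + NumXMMRegsUsed * 16;
  unsigned SaveAreaSize = NumArgGPRs * 8 + NumArgXMMs * 16;
  std::printf("gp_offset=%u fp_offset=%u save_area=%u bytes\n", GPOffset,
              FPOffset, SaveAreaSize); // gp_offset=16 fp_offset=64 save_area=176
}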
+ SmallVector SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), dl)); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), dl)); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI.hasMustTailInVarArgFunc()) { @@ -3462,7 +3473,8 @@ // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, guardedXmmRegs, + RegParmTypes, CC_X86); // Conservatively forward AL on x86_64, since it might be used for varargs. if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { @@ -3473,9 +3485,48 @@ // Copy all forwards from physical to virtual registers. for (ForwardedRegister &FR : Forwards) { // FIXME: Can we use a less constrained schedule? - SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); - FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); - Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); + if (!FR.IsGuarded()) { + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); + FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); + Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); + } + } + + if (guardedXmmRegs.size() > 0) { + if (MFI.hasVAStart()) {
+ // All incoming XMM registers are already stored by the VASTART + // handling. Reuse these stored values for the thunk's forwarded + // parameters here. + FuncInfo->setThunkRegSaveFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // TODO: Add a check for when the guarded vararg parameters need not be + // TODO: stored: if the function contains only musttail calls, if it does + // TODO: not use floating-point types, or if Attribute::NoImplicitFloat is + // TODO: specified, then it is possible to not store/restore the thunk's + // TODO: guarded vararg parameters. + + // TODO: Implement support for YMM and ZMM vararg registers. + + // Allocate stack space to save guardedXmmRegs; 16 is the size of an XMM + // register. + FuncInfo->setThunkRegSaveFrameIndex( + MFI.CreateStackObject(guardedXmmRegs.size() * 16, 16, false)); + + // Save the guarded forwards into the guarded area. + SmallVector VarargMemOps; + SmallVector VarargXMMOps; + VarargXMMOps.push_back(Chain); + VarargXMMOps.push_back(ALVal); + VarargXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getThunkRegSaveFrameIndex(), dl)); + VarargXMMOps.push_back(DAG.getIntPtrConstant(0, dl)); + VarargXMMOps.insert(VarargXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + VarargMemOps.push_back(DAG.getNode(X86ISD::VARARG_THUNK_SAVE_XMM_REGS, + dl, MVT::Other, VarargXMMOps)); + if (!VarargMemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VarargMemOps); + } } }
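Note (illustrative aside, not part of the patch): the loop above keeps the existing copy-through-virtual-register path for ordinary forwards and skips registers recorded with the VReg == 0 sentinel, which instead travel through the guarded save area. A rough standalone C++ sketch of that partition, with std:: containers standing in for SmallDenseSet/SmallVectorImpl and all names hypothetical:

  #include <cstdint>
  #include <unordered_set>
  #include <vector>

  using PhysReg = std::uint16_t;

  struct Forward {
    unsigned VReg; // 0 means "guarded": no virtual register was created
    PhysReg PReg;
    bool isGuarded() const { return VReg == 0; }
  };

  // Guarded registers get the VReg == 0 sentinel; the rest get a fresh
  // virtual register, roughly mirroring analyzeMustTailForwardedRegisters.
  std::vector<Forward> forwardRegs(const std::vector<PhysReg> &Remaining,
                                   const std::unordered_set<PhysReg> &Guarded,
                                   unsigned &NextVReg) {
    std::vector<Forward> Forwards;
    for (PhysReg P : Remaining)
      Forwards.push_back(Guarded.count(P) ? Forward{0, P}
                                          : Forward{NextVReg++, P});
    return Forwards;
  }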
@@ -3497,8 +3548,9 @@ } if (!Is64Bit) { - // RegSaveFrameIndex is X86-64 only. + // RegSaveFrameIndex and ThunkRegSaveFrameIndex are X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + FuncInfo->setThunkRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. @@ -3904,8 +3956,10 @@ if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { - SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + if (!F.IsGuarded()) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + } } } @@ -28745,6 +28799,8 @@ case X86ISD::PSADBW: return "X86ISD::PSADBW"; case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; + case X86ISD::VARARG_THUNK_SAVE_XMM_REGS: + return "X86ISD::VARARG_THUNK_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; @@ -29509,8 +29565,68 @@ return endMBB; }
+// This function creates an additional block for storing the guarded varargs +// registers. It adds a check for %al to the entry block so that +// GuardedRegsBlk is skipped if the XMM registers should not be stored. +// +// EntryBlk[VAPseudoInstr] EntryBlk +// | | . +// | | . +// | | GuardedRegsBlk +// | => | . +// | | . +// | TailBlk[VAPseudoInstr] +// | | +// | | +// +static std::pair +CreateGuardedRegsBlock(MachineBasicBlock *EntryBlk, MachineInstr &VAPseudoInstr, + const X86Subtarget &Subtarget) { + + MachineFunction *Func = EntryBlk->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = VAPseudoInstr.getDebugLoc(); + Register CountReg = VAPseudoInstr.getOperand(0).getReg(); + + // Create the new basic blocks. One block contains all the XMM stores, + // and one block is the final destination regardless of whether any + // stores were performed. + const BasicBlock *LLVMBlk = EntryBlk->getBasicBlock(); + MachineFunction::iterator EntryBlkIter = ++EntryBlk->getIterator(); + MachineBasicBlock *GuardedRegsBlk = Func->CreateMachineBasicBlock(LLVMBlk); + MachineBasicBlock *TailBlk = Func->CreateMachineBasicBlock(LLVMBlk); + Func->insert(EntryBlkIter, GuardedRegsBlk); + Func->insert(EntryBlkIter, TailBlk); + + GuardedRegsBlk->setIsGuardedRegsBlk(); + + // Transfer the remainder of EntryBlk and its successor edges to TailBlk. + TailBlk->splice(TailBlk->begin(), EntryBlk, + std::next(MachineBasicBlock::iterator(VAPseudoInstr)), + EntryBlk->end()); + TailBlk->transferSuccessorsAndUpdatePHIs(EntryBlk); + + // The entry block will now fall through to GuardedRegsBlk. + EntryBlk->addSuccessor(GuardedRegsBlk); + // GuardedRegsBlk will fall through to TailBlk. + GuardedRegsBlk->addSuccessor(TailBlk); + + if (!Subtarget.isCallingConvWin64(Func->getFunction().getCallingConv())) { + // If %al is 0, branch around GuardedRegsBlk. + BuildMI(EntryBlk, DL, TII->get(X86::TEST8rr)) + .addReg(CountReg) + .addReg(CountReg); + BuildMI(EntryBlk, DL, TII->get(X86::JCC_1)) + .addMBB(TailBlk) + .addImm(X86::COND_E); + EntryBlk->addSuccessor(TailBlk); + } + + return std::make_pair(GuardedRegsBlk, TailBlk); +} + MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( - MachineInstr &MI, MachineBasicBlock *MBB) const { + MachineInstr &PseudoVaStartInstr, MachineBasicBlock *EntryBlk) const { // Emit code to save XMM registers to the stack.
The ABI says that the // number of registers to save is given in %al, so it's theoretically // possible to do an indirect jump trick to avoid saving all of them, @@ -29519,69 +29635,133 @@ // easier on the hardware branch predictor, and stores aren't all that // expensive anyway. - // Create the new basic blocks. One block contains all the XMM stores, - // and one block is the final destination regardless of whether any - // stores were performed. - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction *F = MBB->getParent(); - MachineFunction::iterator MBBIter = ++MBB->getIterator(); - MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, XMMSaveMBB); - F->insert(MBBIter, EndMBB); + MachineBasicBlock *GuardedRegsBlk = nullptr; + MachineBasicBlock *TailBlk = nullptr; - // Transfer the remainder of MBB and its successor edges to EndMBB. - EndMBB->splice(EndMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - EndMBB->transferSuccessorsAndUpdatePHIs(MBB); + std::tie(GuardedRegsBlk, TailBlk) = + CreateGuardedRegsBlock(EntryBlk, PseudoVaStartInstr, Subtarget); - // The original block will now fall through to the XMM save block. - MBB->addSuccessor(XMMSaveMBB); - // The XMMSaveMBB will fall through to the end block. - XMMSaveMBB->addSuccessor(EndMBB); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = PseudoVaStartInstr.getDebugLoc(); + int64_t RegSaveFrameIndex = PseudoVaStartInstr.getOperand(1).getImm(); + int64_t VarArgsFPOffset = PseudoVaStartInstr.getOperand(2).getImm(); + MachineFunction *Func = EntryBlk->getParent(); // Now add the instructions. - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); - Register CountReg = MI.getOperand(0).getReg(); - int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); - int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); + // Make sure the last operand is EFLAGS, which gets clobbered by the branch + // that was just emitted, but clearly shouldn't be "saved". + assert( + (PseudoVaStartInstr.getNumOperands() <= 3 || + !PseudoVaStartInstr.getOperand(PseudoVaStartInstr.getNumOperands() - 1) + .isReg() || + PseudoVaStartInstr.getOperand(PseudoVaStartInstr.getNumOperands() - 1) + .getReg() == X86::EFLAGS) && + "Expected last argument to be EFLAGS"); + + // TODO: add support for YMM and ZMM here. + unsigned MovOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + // save all guarded XMM registers. + for (unsigned OpndIdx = 3, RegIdx = 0; + OpndIdx + 1 < PseudoVaStartInstr.getNumOperands(); OpndIdx++, RegIdx++) { + int64_t offset = RegIdx * 16 + VarArgsFPOffset; + MachineMemOperand *memoryOpnd = Func->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*Func, RegSaveFrameIndex, offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + BuildMI(GuardedRegsBlk, DL, TII->get(MovOpc)) + .addFrameIndex(RegSaveFrameIndex) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/offset) + .addReg(/*Segment=*/0) + .addReg(PseudoVaStartInstr.getOperand(OpndIdx).getReg()) + .addMemOperand(memoryOpnd); + assert(Register::isPhysicalRegister( + PseudoVaStartInstr.getOperand(OpndIdx).getReg())); + GuardedRegsBlk->addLiveIn(PseudoVaStartInstr.getOperand(OpndIdx).getReg()); + } + + PseudoVaStartInstr.eraseFromParent(); // The pseudo instruction is gone now. 
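Note (illustrative aside, not part of the patch): for readers who prefer a source-level model of what the code above emits, the sketch below mimics the resulting control flow with hypothetical names; it stands in for the emitted MachineIR (TEST8rr/JCC_1 in the entry block, one MOVAPSmr per register in GuardedRegsBlk), not for any LLVM API. The %al test is omitted for Win64 calling conventions, exactly as in CreateGuardedRegsBlock.

  #include <cstring>

  struct XmmSaveArea {
    alignas(16) unsigned char Slot[8][16]; // one 16-byte slot per XMM register
  };

  void guardedSaveSketch(unsigned char AL, const unsigned char (*Xmm)[16],
                         XmmSaveArea &Area) {
    if (AL != 0) {                // EntryBlk: testb %al, %al; je TailBlk
      for (int I = 0; I < 8; ++I) // GuardedRegsBlk: one aligned store per register
        std::memcpy(Area.Slot[I], Xmm[I], 16);
    }
    // TailBlk: the rest of the function. Before a musttail jump the same
    // %al test guards the reloads, as the updated CHECK lines below show.
  }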
- if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { - // If %al is 0, branch around the XMM save block. - BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E); - MBB->addSuccessor(EndMBB); + return TailBlk; +} + +MachineBasicBlock * +X86TargetLowering::EmitVarargThunkSaveXMMRegsWithCustomInserter( + MachineInstr &PseudoVarargThunkInstr, MachineBasicBlock *EntryBlk) const { + MachineBasicBlock *GuardedRegsBlk = nullptr; + MachineBasicBlock *TailBlk = nullptr; + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = PseudoVarargThunkInstr.getDebugLoc(); + int64_t ThunkRegSaveFrameIndex = + PseudoVarargThunkInstr.getOperand(1).getImm(); + int64_t VarArgsRegsOffset = PseudoVarargThunkInstr.getOperand(2).getImm(); + MachineFunction *Func = EntryBlk->getParent(); + bool NeedToAddLiveInsIntoGuardedRegsBlk = true; + + // check whether GuardedRegsBlk is already created by VASTART handling code + assert(Func->begin() != Func->end()); + for (auto &Succ : (*Func->begin()).successors()) { + if (Succ->isGuardedRegsBlk()) { + GuardedRegsBlk = Succ; + TailBlk = *GuardedRegsBlk->succ_begin(); + NeedToAddLiveInsIntoGuardedRegsBlk = false; + break; + } } + if (GuardedRegsBlk == nullptr) + std::tie(GuardedRegsBlk, TailBlk) = + CreateGuardedRegsBlock(EntryBlk, PseudoVarargThunkInstr, Subtarget); + + // Now add the instructions. + // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". - assert((MI.getNumOperands() <= 3 || - !MI.getOperand(MI.getNumOperands() - 1).isReg() || - MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) && + assert((PseudoVarargThunkInstr.getNumOperands() <= 3 || + !PseudoVarargThunkInstr + .getOperand(PseudoVarargThunkInstr.getNumOperands() - 1) + .isReg() || + PseudoVarargThunkInstr + .getOperand(PseudoVarargThunkInstr.getNumOperands() - 1) + .getReg() == X86::EFLAGS) && "Expected last argument to be EFLAGS"); + + // TODO: add support for YMM and ZMM here. unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + // In the XMM save block, save all the XMM argument registers. 
- for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { - int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; - MachineMemOperand *MMO = F->getMachineMemOperand( - MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), - MachineMemOperand::MOStore, - /*Size=*/16, /*Align=*/16); - BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) - .addFrameIndex(RegSaveFrameIndex) + for (unsigned OpndIdx = 3, RegIdx = 0; + OpndIdx + 1 < PseudoVarargThunkInstr.getNumOperands(); + OpndIdx++, RegIdx++) { + int64_t Offset = RegIdx * 16 + VarArgsRegsOffset; + + MachineMemOperand *MMO = + Func->getMachineMemOperand(MachinePointerInfo::getFixedStack( + *Func, ThunkRegSaveFrameIndex, Offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + BuildMI(GuardedRegsBlk, DL, TII->get(MOVOpc)) + .addFrameIndex(ThunkRegSaveFrameIndex) .addImm(/*Scale=*/1) .addReg(/*IndexReg=*/0) .addImm(/*Disp=*/Offset) .addReg(/*Segment=*/0) - .addReg(MI.getOperand(i).getReg()) + .addReg(PseudoVarargThunkInstr.getOperand(OpndIdx).getReg()) .addMemOperand(MMO); + assert(Register::isPhysicalRegister( + PseudoVarargThunkInstr.getOperand(OpndIdx).getReg())); + + if (NeedToAddLiveInsIntoGuardedRegsBlk) + GuardedRegsBlk->addLiveIn( + PseudoVarargThunkInstr.getOperand(OpndIdx).getReg()); } - MI.eraseFromParent(); // The pseudo instruction is gone now. + PseudoVarargThunkInstr + .eraseFromParent(); // The pseudo instruction is gone now. - return EndMBB; + return TailBlk; } // The EFLAGS operand of SelectItr might be missing a kill marker @@ -31320,6 +31500,9 @@ case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); + case X86::VARARG_THUNK_SAVE_XMM_REGS: + return EmitVarargThunkSaveXMMRegsWithCustomInserter(MI, BB); + case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); Index: llvm/lib/Target/X86/X86InstrCompiler.td =================================================================== --- llvm/lib/Target/X86/X86InstrCompiler.td +++ llvm/lib/Target/X86/X86InstrCompiler.td @@ -81,6 +81,19 @@ imm:$offset), (implicit EFLAGS)]>; +// x86-64 %al guarded thunk arguments lowering magic. +def VARARG_THUNK_SAVE_XMM_REGS : I<0, Pseudo, + (outs), + (ins GR8:$al, + i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#VARARG_THUNK_SAVE_XMM_REGS $al, $regsavefi, $offset", + [(X86vararg_thunk_save_xmm_regs GR8:$al, + imm:$regsavefi, + imm:$offset), + (implicit EFLAGS)]>; + + // The VAARG_64 pseudo-instruction takes the address of the va_list, // and places the address of the next argument into a register. 
let Defs = [EFLAGS] in Index: llvm/lib/Target/X86/X86InstrInfo.td =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.td +++ llvm/lib/Target/X86/X86InstrInfo.td @@ -99,6 +99,11 @@ SDTCisVT<1, iPTR>, SDTCisVT<2, iPTR>]>; +def SDT_X86VARARG_THUNK_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + + def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, @@ -190,6 +195,12 @@ SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", SDT_X86VASTART_SAVE_XMM_REGS, [SDNPHasChain, SDNPVariadic]>; + +def X86vararg_thunk_save_xmm_regs : + SDNode<"X86ISD::VARARG_THUNK_SAVE_XMM_REGS", + SDT_X86VARARG_THUNK_SAVE_XMM_REGS, + [SDNPHasChain, SDNPVariadic]>; + def X86vaarg64 : SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, Index: llvm/lib/Target/X86/X86MachineFunctionInfo.h =================================================================== --- llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -73,6 +73,9 @@ int VarArgsFrameIndex = 0; /// RegSaveFrameIndex - X86-64 vararg func register save area. int RegSaveFrameIndex = 0; + /// thunkRegSaveFrameIndex - X86-64 vararg func register save area for thunk + /// functions. + int thunkRegSaveFrameIndex = 0; /// VarArgsGPOffset - X86-64 vararg func int reg offset. unsigned VarArgsGPOffset = 0; /// VarArgsFPOffset - X86-64 vararg func fp reg offset. @@ -155,6 +158,9 @@ int getRegSaveFrameIndex() const { return RegSaveFrameIndex; } void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; } + int getThunkRegSaveFrameIndex() const { return thunkRegSaveFrameIndex; } + void setThunkRegSaveFrameIndex(int Idx) { thunkRegSaveFrameIndex = Idx; } + unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; } void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; } Index: llvm/test/CodeGen/X86/musttail-varargs.ll =================================================================== --- llvm/test/CodeGen/X86/musttail-varargs.ll +++ llvm/test/CodeGen/X86/musttail-varargs.ll @@ -1,9 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32 +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | 
FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE-OPT0 ; Test that we actually spill and reload all arguments in the variadic argument ; pack. Doing a normal call will clobber all argument registers, and we will @@ -29,8 +34,8 @@ ; LINUX-NEXT: .cfi_def_cfa_offset 48 ; LINUX-NEXT: pushq %rbx ; LINUX-NEXT: .cfi_def_cfa_offset 56 -; LINUX-NEXT: subq $360, %rsp # imm = 0x168 -; LINUX-NEXT: .cfi_def_cfa_offset 416 +; LINUX-NEXT: subq $232, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 288 ; LINUX-NEXT: .cfi_offset %rbx, -56 ; LINUX-NEXT: .cfi_offset %r12, -48 ; LINUX-NEXT: .cfi_offset %r13, -40 @@ -43,6 +48,11 @@ ; LINUX-NEXT: movq %rdx, %rbp ; LINUX-NEXT: movq %rsi, %rbx ; LINUX-NEXT: movq %rdi, %r14 +; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r9, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; LINUX-NEXT: testb %al, %al ; LINUX-NEXT: je .LBB0_2 @@ -56,11 +66,6 @@ ; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; LINUX-NEXT: .LBB0_2: -; LINUX-NEXT: movq %rbx, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %rbp, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r13, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r12, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r15, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax @@ -68,14 +73,6 @@ ; LINUX-NEXT: movabsq $206158430216, %rax # imm = 0x3000000008 ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %r14, %rdi -; LINUX-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: callq get_f ; LINUX-NEXT: movq %rax, %r11 ; LINUX-NEXT: movq %r14, %rdi @@ -84,16 +81,36 @@ ; LINUX-NEXT: movq %r13, %rcx ; LINUX-NEXT: movq %r12, %r8 ; LINUX-NEXT: movq %r15, %r9 -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; LINUX-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload -; LINUX-NEXT: addq $360, %rsp # imm = 0x168 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB0_4 +; LINUX-NEXT: # %bb.3: +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; 
LINUX-NEXT: addq $232, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 56 +; LINUX-NEXT: popq %rbx +; LINUX-NEXT: .cfi_def_cfa_offset 48 +; LINUX-NEXT: popq %r12 +; LINUX-NEXT: .cfi_def_cfa_offset 40 +; LINUX-NEXT: popq %r13 +; LINUX-NEXT: .cfi_def_cfa_offset 32 +; LINUX-NEXT: popq %r14 +; LINUX-NEXT: .cfi_def_cfa_offset 24 +; LINUX-NEXT: popq %r15 +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: popq %rbp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB0_4: +; LINUX-NEXT: .cfi_def_cfa_offset 288 +; LINUX-NEXT: addq $232, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 56 ; LINUX-NEXT: popq %rbx ; LINUX-NEXT: .cfi_def_cfa_offset 48 @@ -109,6 +126,85 @@ ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-OPT0-LABEL: f_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 336 +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: je .LBB0_2 +; LINUX-OPT0-NEXT: # %bb.1: +; LINUX-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB0_2: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdi, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r8b # 1-byte Reload +; LINUX-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; LINUX-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; LINUX-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movl $48, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movl $8, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, %rdi +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: callq get_f +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-OPT0-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r10b # 1-byte Reload +; LINUX-OPT0-NEXT: movq %rax, (%rsp) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, %al +; LINUX-OPT0-NEXT: movq (%rsp), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB0_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB0_4: +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 336 +; LINUX-OPT0-NEXT: addq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: f_thunk: ; LINUX-X32: # %bb.0: ; LINUX-X32-NEXT: pushq %rbp @@ -123,8 +219,8 @@ ; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 ; LINUX-X32-NEXT: pushq %rbx ; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 -; LINUX-X32-NEXT: subl $344, %esp # imm = 0x158 -; LINUX-X32-NEXT: .cfi_def_cfa_offset 400 +; LINUX-X32-NEXT: subl $216, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 272 ; LINUX-X32-NEXT: .cfi_offset %rbx, -56 ; LINUX-X32-NEXT: .cfi_offset %r12, -48 ; LINUX-X32-NEXT: .cfi_offset %r13, -40 @@ -137,6 +233,11 @@ ; LINUX-X32-NEXT: movq %rdx, %rbp ; LINUX-X32-NEXT: movq %rsi, %rbx ; LINUX-X32-NEXT: movl %edi, %r14d +; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rcx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r8, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r9, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; LINUX-X32-NEXT: testb %al, %al ; LINUX-X32-NEXT: je .LBB0_2 @@ -150,11 +251,6 @@ ; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: .LBB0_2: -; LINUX-X32-NEXT: movq %rbx, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %rbp, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r13, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r12, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r15, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax ; LINUX-X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax @@ -162,14 +258,6 @@ ; LINUX-X32-NEXT: movabsq $206158430216, %rax # imm = 0x3000000008 ; LINUX-X32-NEXT: movq %rax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movl %r14d, %edi -; LINUX-X32-NEXT: movaps %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: callq get_f ; LINUX-X32-NEXT: movl %eax, %r11d ; LINUX-X32-NEXT: movl %r14d, %edi @@ -178,16 +266,36 @@ ; LINUX-X32-NEXT: movq %r13, %rcx ; LINUX-X32-NEXT: movq %r12, %r8 ; LINUX-X32-NEXT: movq %r15, %r9 -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload ; LINUX-X32-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload -; LINUX-X32-NEXT: addl $344, %esp # imm = 0x158 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB0_4 +; LINUX-X32-NEXT: # %bb.3: +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-NEXT: addl $216, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 +; LINUX-X32-NEXT: popq %rbx +; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 +; LINUX-X32-NEXT: popq %r12 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 40 +; LINUX-X32-NEXT: popq %r13 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 32 +; LINUX-X32-NEXT: popq %r14 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 24 +; LINUX-X32-NEXT: popq %r15 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: popq %rbp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB0_4: +; LINUX-X32-NEXT: .cfi_def_cfa_offset 272 +; LINUX-X32-NEXT: addl $216, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 ; LINUX-X32-NEXT: popq %rbx ; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 @@ -203,6 +311,87 @@ ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-X32-OPT0-LABEL: f_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 320 +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB0_2 +; LINUX-X32-OPT0-NEXT: # %bb.1: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; 
LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB0_2: +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rax # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rax, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdi, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %r8b # 1-byte Reload +; LINUX-X32-OPT0-NEXT: leal {{[0-9]+}}(%rsp), %r9d +; LINUX-X32-OPT0-NEXT: movl %r9d, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: leal {{[0-9]+}}(%rsp), %r9d +; LINUX-X32-OPT0-NEXT: movl %r9d, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl $48, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl $8, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %r9d # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movl %r9d, %edi +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r8b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: callq get_f +; LINUX-X32-OPT0-NEXT: movl %eax, %eax +; LINUX-X32-OPT0-NEXT: movl %eax, %ecx +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r10 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rcx, (%esp) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r10, %rcx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq (%esp), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB0_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB0_4: +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 320 +; LINUX-X32-OPT0-NEXT: addl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; WINDOWS-LABEL: f_thunk: ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: pushq %r14 @@ -246,6 +435,36 @@ ; 
WINDOWS-NEXT: .text ; WINDOWS-NEXT: .seh_endproc ; +; WINDOWS-OPT0-LABEL: f_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq $120, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 120 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r10 +; WINDOWS-OPT0-NEXT: movq %r10, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; WINDOWS-OPT0-NEXT: callq get_f +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Reload +; WINDOWS-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movb %r11b, %al +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $120, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; ; X86-NOSSE-LABEL: f_thunk: ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp @@ -264,6 +483,25 @@ ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: jmpl *%eax # TAILCALL ; +; X86-NOSSE-OPT0-LABEL: f_thunk: +; X86-NOSSE-OPT0: # %bb.0: +; X86-NOSSE-OPT0-NEXT: pushl %ebp +; X86-NOSSE-OPT0-NEXT: movl %esp, %ebp +; X86-NOSSE-OPT0-NEXT: andl $-16, %esp +; X86-NOSSE-OPT0-NEXT: subl $48, %esp +; X86-NOSSE-OPT0-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-OPT0-NEXT: leal 12(%ebp), %ecx +; X86-NOSSE-OPT0-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: movl %esp, %ecx +; X86-NOSSE-OPT0-NEXT: movl %eax, (%ecx) +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOSSE-OPT0-NEXT: calll _get_f +; X86-NOSSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl %ecx, 8(%ebp) +; X86-NOSSE-OPT0-NEXT: movl %ebp, %esp +; X86-NOSSE-OPT0-NEXT: popl %ebp +; X86-NOSSE-OPT0-NEXT: jmpl *%eax # TAILCALL +; ; X86-SSE-LABEL: f_thunk: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %ebp @@ -287,6 +525,31 @@ ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: jmpl *%eax # TAILCALL +; +; X86-SSE-OPT0-LABEL: f_thunk: +; X86-SSE-OPT0: # %bb.0: +; X86-SSE-OPT0-NEXT: pushl %ebp +; X86-SSE-OPT0-NEXT: movl %esp, %ebp +; X86-SSE-OPT0-NEXT: andl $-16, %esp +; X86-SSE-OPT0-NEXT: subl $112, %esp +; X86-SSE-OPT0-NEXT: movl 8(%ebp), %eax +; X86-SSE-OPT0-NEXT: leal 12(%ebp), %ecx +; X86-SSE-OPT0-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE-OPT0-NEXT: movl %esp, %ecx +; X86-SSE-OPT0-NEXT: movl %eax, (%ecx) +; X86-SSE-OPT0-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE-OPT0-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: calll _get_f +; X86-SSE-OPT0-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE-OPT0-NEXT: movl %ecx, 8(%ebp) +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movl %ebp, %esp +; X86-SSE-OPT0-NEXT: popl %ebp +; X86-SSE-OPT0-NEXT: jmpl *%eax # TAILCALL %ap = alloca [4 x i8*], align 16 %ap_i8 = bitcast [4 x i8*]* %ap to i8* call void @llvm.va_start(i8* %ap_i8) @@ -300,27 +563,192 @@ ; No regparms on normal x86 conventions. -; This thunk shouldn't require any spills and reloads, assuming the register -; allocator knows what it's doing. +; This thunk stores xmms on entry and restores them before jumping. +; Storing and restoring xmms could be optimized out for this concrete case. define void @g_thunk(i8* %fptr_i8, ...) { ; LINUX-LABEL: g_thunk: ; LINUX: # %bb.0: -; LINUX-NEXT: pushq %rax -; LINUX-NEXT: .cfi_def_cfa_offset 16 -; LINUX-NEXT: popq %r11 +; LINUX-NEXT: subq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB1_2 +; LINUX-NEXT: # %bb.1: +; LINUX-NEXT: movaps %xmm0, (%rsp) +; LINUX-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-NEXT: .LBB1_2: +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB1_4 +; LINUX-NEXT: # %bb.3: +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps (%rsp), %xmm0 +; LINUX-NEXT: addq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%rdi # TAILCALL +; LINUX-NEXT: .LBB1_4: +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: addq $136, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%rdi # TAILCALL ; +; LINUX-OPT0-LABEL: g_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $200, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-OPT0-NEXT: movb %al, %r10b +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: je .LBB1_2 +; LINUX-OPT0-NEXT: # %bb.1: +; LINUX-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB1_2: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload 
+; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB1_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $200, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB1_4: +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-OPT0-NEXT: addq $200, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: g_thunk: ; LINUX-X32: # %bb.0: -; LINUX-X32-NEXT: pushq %rax -; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: subl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB1_2 +; LINUX-X32-NEXT: # %bb.1: +; LINUX-X32-NEXT: movaps %xmm0, (%esp) +; LINUX-X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: .LBB1_2: ; LINUX-X32-NEXT: movl %edi, %r11d -; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB1_4 +; LINUX-X32-NEXT: # %bb.3: +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps (%esp), %xmm0 +; LINUX-X32-NEXT: addl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB1_4: +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: addl $136, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-X32-OPT0-LABEL: g_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $200, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-X32-OPT0-NEXT: movb %al, %r10b +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 
8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB1_2 +; LINUX-X32-OPT0-NEXT: # %bb.1: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB1_2: +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl %eax, %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, (%esp) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq (%esp), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB1_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $200, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB1_4: +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-X32-OPT0-NEXT: addl $200, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; WINDOWS-LABEL: g_thunk: ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: subq $40, %rsp @@ -332,6 +760,19 @@ ; WINDOWS-NEXT: .text ; WINDOWS-NEXT: .seh_endproc ; +; WINDOWS-OPT0-LABEL: g_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq $40, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 40 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $40, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; ; X86-LABEL: g_thunk: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -353,41 +794,321 @@ define void @h_thunk(%struct.Foo* %this, ...) 
{ ; LINUX-LABEL: h_thunk: ; LINUX: # %bb.0: -; LINUX-NEXT: pushq %rax -; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: subq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_2 +; LINUX-NEXT: # %bb.1: +; LINUX-NEXT: movaps %xmm0, (%rsp) +; LINUX-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-NEXT: .LBB2_2: ; LINUX-NEXT: cmpb $1, (%rdi) -; LINUX-NEXT: jne .LBB2_2 -; LINUX-NEXT: # %bb.1: # %then +; LINUX-NEXT: jne .LBB2_4 +; LINUX-NEXT: # %bb.3: # %then ; LINUX-NEXT: movq 8(%rdi), %r11 -; LINUX-NEXT: addq $8, %rsp +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_6 +; LINUX-NEXT: # %bb.5: # %then +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps (%rsp), %xmm0 +; LINUX-NEXT: addq $136, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL -; LINUX-NEXT: .LBB2_2: # %else -; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: .LBB2_4: # %else +; LINUX-NEXT: .cfi_def_cfa_offset 144 ; LINUX-NEXT: movq 16(%rdi), %r11 ; LINUX-NEXT: movl $42, {{.*}}(%rip) -; LINUX-NEXT: addq $8, %rsp +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_8 +; LINUX-NEXT: # %bb.7: # %else +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps (%rsp), %xmm0 +; LINUX-NEXT: addq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB2_6: # %then +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: addq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB2_8: # %else +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: addq $136, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-OPT0-LABEL: h_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: movb %al, %r10b +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: je .LBB2_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, 
{{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB2_4: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: testb $1, (%rax) +; LINUX-OPT0-NEXT: jne .LBB2_1 +; LINUX-OPT0-NEXT: jmp .LBB2_2 +; LINUX-OPT0-NEXT: .LBB2_1: # %then +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq 8(%rax), %rcx +; LINUX-OPT0-NEXT: movq %rax, %rdi +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, %rcx +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB2_6 +; LINUX-OPT0-NEXT: # %bb.5: # %then +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_6: # %then +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_2: # %else +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq 16(%rax), %rcx +; LINUX-OPT0-NEXT: movl $42, {{.*}}(%rip) +; LINUX-OPT0-NEXT: movq %rax, %rdi +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, %rcx +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB2_8 +; LINUX-OPT0-NEXT: # %bb.7: # %else +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_8: # %else +; 
LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: h_thunk: ; LINUX-X32: # %bb.0: -; LINUX-X32-NEXT: pushq %rax -; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: subl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_2 +; LINUX-X32-NEXT: # %bb.1: +; LINUX-X32-NEXT: movaps %xmm0, (%esp) +; LINUX-X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: .LBB2_2: ; LINUX-X32-NEXT: cmpb $1, (%edi) -; LINUX-X32-NEXT: jne .LBB2_2 -; LINUX-X32-NEXT: # %bb.1: # %then +; LINUX-X32-NEXT: jne .LBB2_4 +; LINUX-X32-NEXT: # %bb.3: # %then ; LINUX-X32-NEXT: movl 4(%edi), %r11d -; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_6 +; LINUX-X32-NEXT: # %bb.5: # %then +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps (%esp), %xmm0 +; LINUX-X32-NEXT: addl $136, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL -; LINUX-X32-NEXT: .LBB2_2: # %else -; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: .LBB2_4: # %else +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 ; LINUX-X32-NEXT: movl 8(%edi), %r11d ; LINUX-X32-NEXT: movl $42, {{.*}}(%rip) -; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_8 +; LINUX-X32-NEXT: # %bb.7: # %else +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps (%esp), %xmm0 +; LINUX-X32-NEXT: addl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB2_6: # %then +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: addl $136, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB2_8: # %else +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: addl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; +; LINUX-X32-OPT0-LABEL: h_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: movb %al, %r10b +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, 
{{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB2_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB2_4: +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: testb $1, (%eax) +; LINUX-X32-OPT0-NEXT: jne .LBB2_1 +; LINUX-X32-OPT0-NEXT: jmp .LBB2_2 +; LINUX-X32-OPT0-NEXT: .LBB2_1: # %then +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl 4(%eax), %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB2_6 +; LINUX-X32-OPT0-NEXT: # %bb.5: # %then +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_6: # %then +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_2: # %else +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl 8(%eax), %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl $42, {{.*}}(%rip) +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; 
LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB2_8 +; LINUX-X32-OPT0-NEXT: # %bb.7: # %else +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_8: # %else +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL ; ; WINDOWS-LABEL: h_thunk: ; WINDOWS: # %bb.0: @@ -409,23 +1130,136 @@ ; WINDOWS-NEXT: .text ; WINDOWS-NEXT: .seh_endproc ; -; X86-LABEL: h_thunk: -; X86: # %bb.0: -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb $1, (%eax) -; X86-NEXT: jne LBB2_2 -; X86-NEXT: # %bb.1: # %then -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: popl %eax -; X86-NEXT: jmpl *%ecx # TAILCALL -; X86-NEXT: LBB2_2: # %else -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl $42, _g -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: popl %eax -; X86-NEXT: jmpl *%ecx # TAILCALL +; WINDOWS-OPT0-LABEL: h_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq $88, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 88 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: testb $1, (%rcx) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; WINDOWS-OPT0-NEXT: jne .LBB2_1 +; WINDOWS-OPT0-NEXT: jmp .LBB2_2 +; WINDOWS-OPT0-NEXT: .LBB2_1: # %then +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq 8(%rax), %rcx +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rax, %rcx +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $88, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .LBB2_2: # %else +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq 16(%rax), %rcx +; WINDOWS-OPT0-NEXT: movl $42, {{.*}}(%rip) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rax, %rcx +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $88, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; +; X86-NOSSE-LABEL: h_thunk: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: cmpb $1, (%eax) +; X86-NOSSE-NEXT: jne LBB2_2 +; X86-NOSSE-NEXT: # %bb.1: # %then +; X86-NOSSE-NEXT: movl 4(%eax), %ecx +; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: jmpl *%ecx # TAILCALL +; X86-NOSSE-NEXT: LBB2_2: # %else +; X86-NOSSE-NEXT: movl 8(%eax), %ecx +; X86-NOSSE-NEXT: movl $42, _g +; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: jmpl *%ecx # TAILCALL +; +; X86-NOSSE-OPT0-LABEL: h_thunk: +; X86-NOSSE-OPT0: # %bb.0: +; X86-NOSSE-OPT0-NEXT: subl $8, %esp +; X86-NOSSE-OPT0-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-OPT0-NEXT: testb $1, (%eax) +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOSSE-OPT0-NEXT: jne LBB2_1 +; X86-NOSSE-OPT0-NEXT: jmp LBB2_2 +; X86-NOSSE-OPT0-NEXT: LBB2_1: # %then +; X86-NOSSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl 4(%eax), %ecx +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: addl $8, %esp +; X86-NOSSE-OPT0-NEXT: jmpl *%ecx # TAILCALL +; X86-NOSSE-OPT0-NEXT: LBB2_2: # %else +; X86-NOSSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl 8(%eax), %ecx +; X86-NOSSE-OPT0-NEXT: movl $42, _g +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: addl $8, %esp +; X86-NOSSE-OPT0-NEXT: jmpl *%ecx # TAILCALL +; +; X86-SSE-LABEL: h_thunk: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cmpb $1, (%eax) +; X86-SSE-NEXT: jne LBB2_2 +; X86-SSE-NEXT: # %bb.1: # %then +; X86-SSE-NEXT: movl 4(%eax), %ecx +; X86-SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: jmpl *%ecx # TAILCALL +; X86-SSE-NEXT: LBB2_2: # %else +; X86-SSE-NEXT: movl 8(%eax), %ecx +; X86-SSE-NEXT: movl $42, _g +; X86-SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: jmpl *%ecx # TAILCALL +; +; X86-SSE-OPT0-LABEL: h_thunk: +; X86-SSE-OPT0: # %bb.0: +; X86-SSE-OPT0-NEXT: subl $92, %esp +; X86-SSE-OPT0-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-OPT0-NEXT: testb $1, (%eax) +; X86-SSE-OPT0-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE-OPT0-NEXT: movups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: jne LBB2_1 +; X86-SSE-OPT0-NEXT: jmp LBB2_2 +; X86-SSE-OPT0-NEXT: LBB2_1: # %then +; X86-SSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE-OPT0-NEXT: movl 4(%eax), %ecx +; X86-SSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-OPT0-NEXT: 
addl $92, %esp
+; X86-SSE-OPT0-NEXT: jmpl *%ecx # TAILCALL
+; X86-SSE-OPT0-NEXT: LBB2_2: # %else
+; X86-SSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE-OPT0-NEXT: movl 8(%eax), %ecx
+; X86-SSE-OPT0-NEXT: movl $42, _g
+; X86-SSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
+; X86-SSE-OPT0-NEXT: addl $92, %esp
+; X86-SSE-OPT0-NEXT: jmpl *%ecx # TAILCALL
 %cond_p = getelementptr %struct.Foo, %struct.Foo* %this, i32 0, i32 0
 %cond = load i1, i1* %cond_p
 br i1 %cond, label %then, label %else
Index: llvm/test/CodeGen/X86/vastart-defs-eflags.ll
===================================================================
--- llvm/test/CodeGen/X86/vastart-defs-eflags.ll
+++ llvm/test/CodeGen/X86/vastart-defs-eflags.ll
@@ -9,6 +9,11 @@
 ; CHECK-LABEL: check_flag:
 ; CHECK: ## %bb.0: ## %entry
 ; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT: testb %al, %al
 ; CHECK-NEXT: je LBB0_2
 ; CHECK-NEXT: ## %bb.1: ## %entry
@@ -21,11 +26,6 @@
 ; CHECK-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: LBB0_2: ## %entry
-; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: testl $512, %edi ## imm = 0x200
 ; CHECK-NEXT: je LBB0_4
Index: llvm/test/CodeGen/X86/x32-va_start.ll
===================================================================
--- llvm/test/CodeGen/X86/x32-va_start.ll
+++ llvm/test/CodeGen/X86/x32-va_start.ll
@@ -27,6 +27,11 @@
 call void @llvm.lifetime.start.p0i8(i64 16, i8* %0) #2
 call void @llvm.va_start(i8* %0)
 ; SSE: subl $72, %esp
+; CHECK-DAG: movq %r9
+; CHECK-DAG: movq %r8
+; CHECK-DAG: movq %rcx
+; CHECK-DAG: movq %rdx
+; CHECK-DAG: movq %rsi
 ; SSE: testb %al, %al
 ; SSE: je .[[NOFP:.*]]
 ; SSE-DAG: movaps %xmm1
@@ -38,11 +43,6 @@
 ; SSE-DAG: movaps %xmm7
 ; NOSSE-NOT: xmm
 ; SSE: .[[NOFP]]:
-; CHECK-DAG: movq %r9
-; CHECK-DAG: movq %r8
-; CHECK-DAG: movq %rcx
-; CHECK-DAG: movq %rdx
-; CHECK-DAG: movq %rsi
 %gp_offset_p = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0, i32 0
 %gp_offset = load i32, i32* %gp_offset_p, align 16
 %fits_in_gp = icmp ult i32 %gp_offset, 41
Index: llvm/test/CodeGen/X86/xmm-vararg-noopt.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/xmm-vararg-noopt.ll
@@ -0,0 +1,49 @@
+; RUN: llc -O0 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+; CHECK-LABEL: testvarargs
+; Ensure that xmm registers are not used before testing %al
+; CHECK-NOT: xmm
+; CHECK: testb %al, %al
+; CHECK-NOT: xmm
+; CHECK: # %bb.1
+; CHECK-NEXT: %xmm0, {{.*}}%rsp
+; CHECK-NEXT: %xmm1, {{.*}}%rsp
+; CHECK-NEXT: %xmm2, {{.*}}%rsp
+; CHECK-NEXT: %xmm3, {{.*}}%rsp
+; CHECK-NEXT: %xmm4, {{.*}}%rsp
+; CHECK-NEXT: %xmm5, {{.*}}%rsp
+; CHECK-NEXT: %xmm6, {{.*}}%rsp
+; CHECK-NEXT: %xmm7, {{.*}}%rsp
+
+; ModuleID = 'variadic.c'
+source_filename = "variadic.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+@.str = private unnamed_addr constant [9 x i8] c"\0A hello \00", align 1
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @testvarargs(i8* %fmt, ...) {
+entry:
+  %fmt.addr = alloca i8*, align 8
+  %va = alloca [1 x %struct.__va_list_tag], align 16
+  store i8* %fmt, i8** %fmt.addr, align 8
+  %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %va, i64 0, i64 0
+  %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8*
+  call void @llvm.va_start(i8* %arraydecay1)
+  %arraydecay2 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %va, i64 0, i64 0
+  %arraydecay23 = bitcast %struct.__va_list_tag* %arraydecay2 to i8*
+  call void @llvm.va_end(i8* %arraydecay23)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i64 0, i64 0))
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.va_start(i8*)
+
+; Function Attrs: nounwind
+declare void @llvm.va_end(i8*)
+
+declare dso_local i32 @printf(i8*, ...)