Index: llvm/include/llvm/CodeGen/CallingConvLower.h =================================================================== --- llvm/include/llvm/CodeGen/CallingConvLower.h +++ llvm/include/llvm/CodeGen/CallingConvLower.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_CALLINGCONVLOWER_H #define LLVM_CODEGEN_CALLINGCONVLOWER_H +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -163,10 +164,12 @@ }; /// Describes a register that needs to be forwarded from the prologue to a -/// musttail call. +/// musttail call. Specifying VReg == 0 means that the register should be +/// put into guarded area and no virtual register was created for it. struct ForwardedRegister { ForwardedRegister(unsigned VReg, MCPhysReg PReg, MVT VT) : VReg(VReg), PReg(PReg), VT(VT) {} + bool IsGuarded() const { return VReg == 0; } unsigned VReg; MCPhysReg PReg; MVT VT; @@ -525,8 +528,9 @@ /// Compute the set of registers that need to be preserved and forwarded to /// any musttail calls. void analyzeMustTailForwardedRegisters( - SmallVectorImpl &Forwards, ArrayRef RegParmTypes, - CCAssignFn Fn); + SmallVectorImpl &Forwards, + const SmallDenseSet &GuardedForwardedRegs, + ArrayRef RegParmTypes, CCAssignFn Fn); /// Returns true if the results of the two calling conventions are compatible. /// This is usually part of the check for tailcall eligibility. Index: llvm/include/llvm/CodeGen/MachineBasicBlock.h =================================================================== --- llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -110,6 +110,10 @@ /// Indicate that this basic block is entered via an exception handler. bool IsEHPad = false; + /// Indicate that this basic block used for saving vararg registers + /// and is entered from entry block. + bool IsGuardedRegsBlk = false; + /// Indicate that this basic block is potentially the target of an indirect /// branch. 
bool AddressTaken = false; @@ -378,6 +382,14 @@ /// Set alignment of the basic block. void setAlignment(Align A) { Alignment = A; } + /// Returns true if the block is used to save guarded varargs registers. + /// This basic block is entered from an entry block. + bool isGuardedRegsBlk() const { return IsGuardedRegsBlk; } + + /// Marks the block as one which is used to save guarded varargs registers. + /// This basic block is entered from an entry block. + void setIsGuardedRegsBlk(bool V = true) { IsGuardedRegsBlk = V; } + /// Returns true if the block is a landing pad. That is this basic block is /// entered via an exception handler. bool isEHPad() const { return IsEHPad; } Index: llvm/lib/CodeGen/CallingConvLower.cpp =================================================================== --- llvm/lib/CodeGen/CallingConvLower.cpp +++ llvm/lib/CodeGen/CallingConvLower.cpp @@ -236,8 +236,9 @@ } void CCState::analyzeMustTailForwardedRegisters( - SmallVectorImpl &Forwards, ArrayRef RegParmTypes, - CCAssignFn Fn) { + SmallVectorImpl &Forwards, + const SmallDenseSet &GuardedForwardedRegs, + ArrayRef RegParmTypes, CCAssignFn Fn) { // Oftentimes calling conventions will not user register parameters for // variadic functions, so we need to assume we're not variadic so that we get // all the registers that might be used in a non-variadic call. 
@@ -250,8 +251,11 @@ const TargetLowering *TL = MF.getSubtarget().getTargetLowering(); const TargetRegisterClass *RC = TL->getRegClassFor(RegVT); for (MCPhysReg PReg : RemainingRegs) { - unsigned VReg = MF.addLiveIn(PReg, RC); - Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT)); + if (GuardedForwardedRegs.count(PReg) == 0) { + unsigned VReg = MF.addLiveIn(PReg, RC); + Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT)); + } else + Forwards.push_back(ForwardedRegister(0, PReg, RegVT)); } } } Index: llvm/lib/CodeGen/MachineVerifier.cpp =================================================================== --- llvm/lib/CodeGen/MachineVerifier.cpp +++ llvm/lib/CodeGen/MachineVerifier.cpp @@ -621,9 +621,11 @@ if (!MF->getProperties().hasProperty( MachineFunctionProperties::Property::NoPHIs) && MRI->tracksLiveness()) { // If this block has allocatable physical registers live-in, check that - // it is an entry block or landing pad. + // it is an entry block or landing pad or varargs guarded registers + // saving block. for (const auto &LI : MBB->liveins()) { if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() && + !MBB->isGuardedRegsBlk() && MBB->getIterator() != MBB->getParent()->begin()) { report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB); report_context(LI.PhysReg); Index: llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -209,6 +209,13 @@ // If we have a musttail call in a variadic function, we need to ensure we // forward implicit register parameters. if (const auto *CI = dyn_cast(&I)) { + // check for llvm::Intrinsic::icall_branch_funnel intrinsic. 
+ // we do not store varargs parameters explicitly for icall_branch_funnel + if (CI->getCalledFunction() && + CI->getCalledFunction()->getIntrinsicID() == + llvm::Intrinsic::icall_branch_funnel) + continue; + if (CI->isMustTailCall() && Fn->isVarArg()) MF->getFrameInfo().setHasMustTailInVarArgFunc(true); } Index: llvm/lib/Target/AArch64/AArch64CallLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ llvm/lib/Target/AArch64/AArch64CallLowering.cpp @@ -397,7 +397,16 @@ // Later on, we can use this vector to restore the registers if necessary. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn); + + // TODO: At x86 platform, XMM varargs parameters should be + // TODO: guarded with check for %al register to avoid using xmm + // TODO: registers(if they were not actually specified). + // TODO: Define set of guarded registers here if the same is necessary + // TODO: for AArch64 (https://bugs.llvm.org/show_bug.cgi?id=42219). + // TODO: Otherwise remove this comment. + SmallDenseSet guardedRegs; + CCInfo.analyzeMustTailForwardedRegisters(Forwards, guardedRegs, RegParmTypes, + AssignFn); // Conservatively forward X8, since it might be used for an aggregate // return. Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3344,8 +3344,16 @@ // Compute the set of forwarded registers. The rest are scratch. 
SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, - CC_AArch64_AAPCS); + + // TODO: At x86 platform, XMM varargs parameters should be + // TODO: guarded with check for %al register to avoid using xmm + // TODO: registers(if they were not actually specified). + // TODO: Define set of guarded registers here if the same is necessary + // TODO: for AArch64 (https://bugs.llvm.org/show_bug.cgi?id=42219). + // TODO: Otherwise remove this comment. + SmallDenseSet guardedRegs; + CCInfo.analyzeMustTailForwardedRegisters(Forwards, guardedRegs, + RegParmTypes, CC_AArch64_AAPCS); // Conservatively forward X8, since it might be used for aggregate return. if (!CCInfo.isAllocated(AArch64::X8)) { Index: llvm/lib/Target/X86/X86ExpandPseudo.cpp =================================================================== --- llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -64,6 +64,9 @@ bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool ExpandMBB(MachineBasicBlock &MBB); + + void CreateTailCallBlocksPair(MachineBasicBlock &OriginalTailCallBlk, + MachineBasicBlock::iterator &TCPseudoInstr); }; char X86ExpandPseudo::ID = 0; @@ -173,6 +176,209 @@ JTMBB->erase(JTInst); } +// this function replaces original tail call instruction with two versions +// of tailcall instruction. One is fully similar to original, another has xmm +// registers restoring code inserted previously. Additionally there is created a +// branch which checks %al and selects proper version of tailcall. 
+// +// f_thunk: f_thunk: +// # %bb.1: => # %bb.1: +// addq 32, %rsp testb %al, %al +// jmpq tc_func je .LBB0_2 +// # %bb.2: +// movaps 96(%rsp), %xmm0 +// addq 32, %rsp +// jmpq tc_func +// .LBB0_2: +// # %bb.3: +// addq 32, %rsp +// jmpq tc_func +// +void X86ExpandPseudo::CreateTailCallBlocksPair( + MachineBasicBlock &OriginalTailCallBlk, + MachineBasicBlock::iterator &TCPseudoInstr) { + + MachineFunction *Func = OriginalTailCallBlk.getParent(); + X86MachineFunctionInfo *X86Info = Func->getInfo(); + const auto &Forwards = X86Info->getForwardedMustTailRegParms(); + + // enumerate forwarded registers and check for existence + // of any guarded registers. + bool hasGuardedArgs = false; + for (auto &F : Forwards) + if (F.IsGuarded()) { + hasGuardedArgs = true; + break; + } + + // do nothing if there are no guarded registers + if (!hasGuardedArgs) + return; + + const BasicBlock *LLVM_BB = OriginalTailCallBlk.getBasicBlock(); + + MachineBasicBlock::iterator TailCallMInstr = std::prev(TCPseudoInstr); + DebugLoc DL = TCPseudoInstr->getDebugLoc(); + + // create two blocks for tailcalls. 
+ MachineFunction::iterator MBBIter = ++OriginalTailCallBlk.getIterator(); + MachineBasicBlock *TailCallBlkWithGuardedRegs = + Func->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *TailCallBlk = Func->CreateMachineBasicBlock(LLVM_BB); + Func->insert(MBBIter, TailCallBlkWithGuardedRegs); + Func->insert(MBBIter, TailCallBlk); + + TailCallBlk->transferSuccessors(&OriginalTailCallBlk); + OriginalTailCallBlk.addSuccessor(TailCallBlkWithGuardedRegs); + OriginalTailCallBlk.addSuccessor(TailCallBlk); + + // search for the start of stack restoring code + MachineInstr *FirstInstructionOfStackRestoringCode = &*TailCallMInstr; + + for (MachineBasicBlock::reverse_iterator CurStackRestoreInstr = + TailCallMInstr.getReverse(); + CurStackRestoreInstr != OriginalTailCallBlk.rend(); + ++CurStackRestoreInstr) { + + // skip tail call instruction + if (CurStackRestoreInstr->getOpcode() == TailCallMInstr->getOpcode()) + continue; + + // skip CFI instructions + if (CurStackRestoreInstr->isCFIInstruction()) + continue; + + if ((CurStackRestoreInstr->getOpcode() == X86::SUB64ri32 || + CurStackRestoreInstr->getOpcode() == X86::SUB64ri8 || + CurStackRestoreInstr->getOpcode() == X86::SUB32ri || + CurStackRestoreInstr->getOpcode() == X86::SUB32ri8) && + CurStackRestoreInstr->getOperand(0).getReg() == + TRI->getStackRegister()) { + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } else if ((CurStackRestoreInstr->getOpcode() == X86::ADD64ri32 || + CurStackRestoreInstr->getOpcode() == X86::ADD64ri8 || + CurStackRestoreInstr->getOpcode() == X86::ADD32ri || + CurStackRestoreInstr->getOpcode() == X86::ADD32ri8) && + CurStackRestoreInstr->getOperand(0).getReg() == + TRI->getStackRegister()) { + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } else if (CurStackRestoreInstr->getOpcode() == X86::POP64r) { + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } else if ((CurStackRestoreInstr->getOpcode() == 
X86::LEA32r || + CurStackRestoreInstr->getOpcode() == X86::LEA64_32r) && + CurStackRestoreInstr->getOperand(0).getReg() == + TRI->getStackRegister() && + CurStackRestoreInstr->getOperand(1).getReg() == + TRI->getStackRegister() && + CurStackRestoreInstr->getOperand(2).getImm() == 1 && + CurStackRestoreInstr->getOperand(3).getReg() == + X86::NoRegister && + CurStackRestoreInstr->getOperand(5).getReg() == + X86::NoRegister) { + // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } + + break; + } + + // copy stack restoring code and tailcall instruction into + // two created blocks. Delete copied instructions from the + // OriginalTailCallBlk. + MachineBasicBlock::iterator curInstr = FirstInstructionOfStackRestoringCode; + + do { + // copy instructions into TailCallBlkWithGuardedRegs + MachineInstrBuilder MIB = BuildMI(TailCallBlkWithGuardedRegs, DL, + TII->get(curInstr->getOpcode())); + + for (auto MO : curInstr->operands()) + MIB->addOperand(*Func, MO); + + // copy instructions into TailCallBlk + MachineInstrBuilder SMIB = + BuildMI(TailCallBlk, DL, TII->get(curInstr->getOpcode())); + + for (auto MO : curInstr->operands()) + SMIB->addOperand(*Func, MO); + + // stop copying if we achieved tail call instruction + if (curInstr->getOpcode() == TailCallMInstr->getOpcode()) { + OriginalTailCallBlk.erase(curInstr); + break; + } + + curInstr = &*OriginalTailCallBlk.erase(curInstr); + } while (curInstr != OriginalTailCallBlk.end()); + + // copy call site information into new tail call instructions + OriginalTailCallBlk.getParent()->copyCallSiteInfo( + &*TCPseudoInstr, &*TailCallBlkWithGuardedRegs->getLastNonDebugInstr()); + + OriginalTailCallBlk.getParent()->copyCallSiteInfo( + &*TCPseudoInstr, &*TailCallBlk->getLastNonDebugInstr()); + + // If %al is 0, branch around the XMM save block. 
+ BuildMI(&OriginalTailCallBlk, DL, TII->get(X86::TEST8rr)) + .addReg(X86::AL) + .addReg(X86::AL); + BuildMI(&OriginalTailCallBlk, DL, TII->get(X86::JCC_1)) + .addMBB(TailCallBlk) + .addImm(X86::COND_E); + + // add code restoring xmm registers into start of TailCallBlkWithGuardedRegs + MachineInstr &TailCallInstrFromGuardedBlk = + *TailCallBlkWithGuardedRegs->getLastNonDebugInstr(); + + // TODO: take into account YMM, ZMM here + unsigned MOVOpc = STI->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm; + + int RegIdx = 0; + for (const auto &Fwd : Forwards) { + if (Fwd.IsGuarded()) { + int64_t OffsetInsideSaveArea = + (Func->getFrameInfo().hasVAStart() ? X86Info->getVarArgsFPOffset() + : 0); + unsigned BaseReg; + int64_t Offset = + X86FL->getFrameIndexReference( + *Func, X86Info->getThunkRegSaveFrameIndex(), BaseReg) + + RegIdx * 16 + OffsetInsideSaveArea; + + MachineMemOperand *MMO = Func->getMachineMemOperand( + MachinePointerInfo::getFixedStack( + *Func, X86Info->getThunkRegSaveFrameIndex(), Offset), + MachineMemOperand::MOLoad, + /*Size=*/16, /*Align=*/16); + + BuildMI(*TailCallBlkWithGuardedRegs, TailCallBlkWithGuardedRegs->begin(), + DL, TII->get(MOVOpc), Fwd.PReg) + .addReg(BaseReg) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addMemOperand(MMO); + + TailCallInstrFromGuardedBlk.addOperand( + MachineOperand::CreateReg(Fwd.PReg, false /*IsDef*/, true /*IsImp*/)); + RegIdx++; + } + } + + // add liveins into newly created blocks + for (auto &MO : TCPseudoInstr->operands()) { + if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) { + TailCallBlk->addLiveIn(MO.getReg()); + TailCallBlkWithGuardedRegs->addLiveIn(MO.getReg()); + } + } +} + /// If \p MBBI is a pseudo instruction, this method expands /// it to the corresponding (sequence of) actual instruction(s). /// \returns true if \p MBBI has been expanded. 
@@ -275,7 +481,17 @@ MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); - MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); + MBB.getParent()->copyCallSiteInfo(&*MBBI, &NewMI); + MachineFunction *Func = MBB.getParent(); + + // check for case when variadic function is a thunk. + // We need to propagate parameters into final tailcall then. + // Passing xmm parameters a bit tricky in this case. + // Xmm parameters should be guarded with the check for %al + // register. + if (!STI->isCallingConvWin64(Func->getFunction().getCallingConv()) && + STI->is64Bit() && Func->getFrameInfo().hasMustTailInVarArgFunc()) + CreateTailCallBlocksPair(MBB, MBBI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); Index: llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -529,6 +529,11 @@ // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, + // Save xmm argument registers of the vararg thunk function to the stack, + // according to %al. An operator is needed so that this can be expanded with + // control flow. + VARARG_THUNK_SAVE_XMM_REGS, + // Windows's _chkstk call to do stack probing. WIN_ALLOCA, @@ -1431,6 +1436,11 @@ EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, MachineBasicBlock *BB) const; + /// Utility function to emit the guarded xmm regs saving block. 
+ MachineBasicBlock * + EmitVarargThunkSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1, MachineInstr &MI2, MachineBasicBlock *BB) const; Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3354,9 +3354,14 @@ F.hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); + SmallDenseSet guardedXmmRegs; + SmallVector LiveGPRs; + SmallVector LiveXMMRegs; + SDValue ALVal; + // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. - if (Is64Bit && isVarArg && MFI.hasVAStart()) { + if (Is64Bit && isVarArg) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); @@ -3366,77 +3371,83 @@ "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. - SmallVector LiveGPRs; - SmallVector LiveXMMRegs; - SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } + if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { - unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); - LiveXMMRegs.push_back( - DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); - } - } - - if (IsWin64) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. 
- int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex( - MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - // Fixup to set vararg frame on shadow area (4 x i64). - if (NumIntRegs < 4) - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); - } else { - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so - // they may be loaded by dereferencing the result of va_next. - FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( - ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); - } - - // Store the integer parameter registers. - SmallVector MemOps; - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy(DAG.getDataLayout())); - unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (SDValue Val : LiveGPRs) { - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - RSFIN, DAG.getIntPtrConstant(Offset, dl)); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), - FuncInfo->getRegSaveFrameIndex(), Offset)); - MemOps.push_back(Store); - Offset += 8; - } - - if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { - // Now store the XMM (fp + vector) parameter registers. - SmallVector SaveXMMOps; - SaveXMMOps.push_back(Chain); - SaveXMMOps.push_back(ALVal); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getRegSaveFrameIndex(), dl)); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getVarArgsFPOffset(), dl)); - SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), - LiveXMMRegs.end()); - MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, SaveXMMOps)); + // FastRegisterAllocator spills virtual registers at basic + // block boundary. 
That leads to usages of xmm registers + // outside of check for %al. Pass physical registers to + // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling. + // See https://bugs.llvm.org/show_bug.cgi?id=42219. + MF.getRegInfo().addLiveIn(Reg); + LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32)); + guardedXmmRegs.insert(Reg); + } + } + + if (MFI.hasVAStart()) { + if (IsWin64) { + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( + MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by dereferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); + } + + // Store the integer parameter registers. + SmallVector MemOps; + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy(DAG.getDataLayout())); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = + DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, + DAG.getIntPtrConstant(Offset, dl)); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset)); + MemOps.push_back(Store); + Offset += 8; + } + + if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { + // Now store the XMM (fp + vector) parameter registers. 
+ SmallVector SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), dl)); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), dl)); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI.hasMustTailInVarArgFunc()) { @@ -3462,7 +3473,8 @@ // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, guardedXmmRegs, + RegParmTypes, CC_X86); // Conservatively forward AL on x86_64, since it might be used for varargs. if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { @@ -3473,9 +3485,48 @@ // Copy all forwards from physical to virtual registers. for (ForwardedRegister &FR : Forwards) { // FIXME: Can we use a less constrained schedule? - SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); - FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); - Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); + if (!FR.IsGuarded()) { + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); + FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); + Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); + } + } + + if (guardedXmmRegs.size() > 0) { + if (MFI.hasVAStart()) { + // all incoming xmm registers are already stored by VAStart + // handling. Reuse these stored values for thunk forwarded + // parameters here. 
+ FuncInfo->setThunkRegSaveFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // TODO: add check for possibility to not store guarded vararg + // TODO: parameters. If function contains only musttail calls, if it + // TODO: does not use floating point types, + // TODO: if Attribute::NoImplicitFloat specified then: + // TODO: it is possible to not store/restore guarded vararg parameters + // TODO: of thunk. + + // TODO: implement support for YMM, ZMM vararg registers + + // allocate stack space to save guardedXmmRegs, 16 is size of XMM + FuncInfo->setThunkRegSaveFrameIndex( + MFI.CreateStackObject(guardedXmmRegs.size() * 16, 16, false)); + + // Save guarded forwards into guarded area + SmallVector VarargMemOps; + SmallVector VarargXMMOps; + VarargXMMOps.push_back(Chain); + VarargXMMOps.push_back(ALVal); + VarargXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getThunkRegSaveFrameIndex(), dl)); + VarargXMMOps.push_back(DAG.getIntPtrConstant(0, dl)); + VarargXMMOps.insert(VarargXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + VarargMemOps.push_back(DAG.getNode(X86ISD::VARARG_THUNK_SAVE_XMM_REGS, + dl, MVT::Other, VarargXMMOps)); + if (!VarargMemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VarargMemOps); + } } } @@ -3497,8 +3548,9 @@ } if (!Is64Bit) { - // RegSaveFrameIndex is X86-64 only. + // RegSaveFrameIndex and ThunkRegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + FuncInfo->setThunkRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. 
@@ -3904,8 +3956,10 @@ if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { - SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + if (!F.IsGuarded()) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + } } } @@ -28758,6 +28812,8 @@ case X86ISD::PSADBW: return "X86ISD::PSADBW"; case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; + case X86ISD::VARARG_THUNK_SAVE_XMM_REGS: + return "X86ISD::VARARG_THUNK_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; @@ -29522,8 +29578,68 @@ return endMBB; } +// This function creates additional block for storing varargs guarded +// registers. It adds check for %al into entry block, to skip +// GuardedRegsBlk if xmm registers should not be stored. +// +// EntryBlk[VAPseudoInstr] EntryBlk +// | | . +// | | . +// | | GuardedRegsBlk +// | => | . +// | | . +// | TailBlk[VAPseudoInstr] +// | | +// | | +// +static std::pair +CreateGuardedRegsBlock(MachineBasicBlock *EntryBlk, MachineInstr &VAPseudoInstr, + const X86Subtarget &Subtarget) { + + MachineFunction *Func = EntryBlk->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = VAPseudoInstr.getDebugLoc(); + Register CountReg = VAPseudoInstr.getOperand(0).getReg(); + + // Create the new basic blocks. One block contains all the XMM stores, + // and one block is the final destination regardless of whether any + // stores were performed. 
+ const BasicBlock *LLVMBlk = EntryBlk->getBasicBlock(); + MachineFunction::iterator EntryBlkIter = ++EntryBlk->getIterator(); + MachineBasicBlock *GuardedRegsBlk = Func->CreateMachineBasicBlock(LLVMBlk); + MachineBasicBlock *TailBlk = Func->CreateMachineBasicBlock(LLVMBlk); + Func->insert(EntryBlkIter, GuardedRegsBlk); + Func->insert(EntryBlkIter, TailBlk); + + GuardedRegsBlk->setIsGuardedRegsBlk(); + + // Transfer the remainder of MBB and its successor edges to EndMBB. + TailBlk->splice(TailBlk->begin(), EntryBlk, + std::next(MachineBasicBlock::iterator(VAPseudoInstr)), + EntryBlk->end()); + TailBlk->transferSuccessorsAndUpdatePHIs(EntryBlk); + + // The original block will now fall through to the XMM save block. + EntryBlk->addSuccessor(GuardedRegsBlk); + // The XMMSaveMBB will fall through to the end block. + GuardedRegsBlk->addSuccessor(TailBlk); + + if (!Subtarget.isCallingConvWin64(Func->getFunction().getCallingConv())) { + // If %al is 0, branch around the XMM save block. + BuildMI(EntryBlk, DL, TII->get(X86::TEST8rr)) + .addReg(CountReg) + .addReg(CountReg); + BuildMI(EntryBlk, DL, TII->get(X86::JCC_1)) + .addMBB(TailBlk) + .addImm(X86::COND_E); + EntryBlk->addSuccessor(TailBlk); + } + + return std::make_pair(GuardedRegsBlk, TailBlk); +} + MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( - MachineInstr &MI, MachineBasicBlock *MBB) const { + MachineInstr &PseudoVaStartInstr, MachineBasicBlock *EntryBlk) const { // Emit code to save XMM registers to the stack. The ABI says that the // number of registers to save is given in %al, so it's theoretically // possible to do an indirect jump trick to avoid saving all of them, @@ -29532,69 +29648,133 @@ // easier on the hardware branch predictor, and stores aren't all that // expensive anyway. - // Create the new basic blocks. One block contains all the XMM stores, - // and one block is the final destination regardless of whether any - // stores were performed. 
- const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction *F = MBB->getParent(); - MachineFunction::iterator MBBIter = ++MBB->getIterator(); - MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, XMMSaveMBB); - F->insert(MBBIter, EndMBB); + MachineBasicBlock *GuardedRegsBlk = nullptr; + MachineBasicBlock *TailBlk = nullptr; - // Transfer the remainder of MBB and its successor edges to EndMBB. - EndMBB->splice(EndMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - EndMBB->transferSuccessorsAndUpdatePHIs(MBB); + std::tie(GuardedRegsBlk, TailBlk) = + CreateGuardedRegsBlock(EntryBlk, PseudoVaStartInstr, Subtarget); - // The original block will now fall through to the XMM save block. - MBB->addSuccessor(XMMSaveMBB); - // The XMMSaveMBB will fall through to the end block. - XMMSaveMBB->addSuccessor(EndMBB); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = PseudoVaStartInstr.getDebugLoc(); + int64_t RegSaveFrameIndex = PseudoVaStartInstr.getOperand(1).getImm(); + int64_t VarArgsFPOffset = PseudoVaStartInstr.getOperand(2).getImm(); + MachineFunction *Func = EntryBlk->getParent(); // Now add the instructions. - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); - Register CountReg = MI.getOperand(0).getReg(); - int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); - int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); + // Make sure the last operand is EFLAGS, which gets clobbered by the branch + // that was just emitted, but clearly shouldn't be "saved". 
+ assert( + (PseudoVaStartInstr.getNumOperands() <= 3 || + !PseudoVaStartInstr.getOperand(PseudoVaStartInstr.getNumOperands() - 1) + .isReg() || + PseudoVaStartInstr.getOperand(PseudoVaStartInstr.getNumOperands() - 1) + .getReg() == X86::EFLAGS) && + "Expected last argument to be EFLAGS"); + + // TODO: add support for YMM and ZMM here. + unsigned MovOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + // save all guarded XMM registers. + for (unsigned OpndIdx = 3, RegIdx = 0; + OpndIdx + 1 < PseudoVaStartInstr.getNumOperands(); OpndIdx++, RegIdx++) { + int64_t offset = RegIdx * 16 + VarArgsFPOffset; + MachineMemOperand *memoryOpnd = Func->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*Func, RegSaveFrameIndex, offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + BuildMI(GuardedRegsBlk, DL, TII->get(MovOpc)) + .addFrameIndex(RegSaveFrameIndex) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/offset) + .addReg(/*Segment=*/0) + .addReg(PseudoVaStartInstr.getOperand(OpndIdx).getReg()) + .addMemOperand(memoryOpnd); + assert(Register::isPhysicalRegister( + PseudoVaStartInstr.getOperand(OpndIdx).getReg())); + GuardedRegsBlk->addLiveIn(PseudoVaStartInstr.getOperand(OpndIdx).getReg()); + } + + PseudoVaStartInstr.eraseFromParent(); // The pseudo instruction is gone now. - if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { - // If %al is 0, branch around the XMM save block. 
- BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E); - MBB->addSuccessor(EndMBB); + return TailBlk; +} + +MachineBasicBlock * +X86TargetLowering::EmitVarargThunkSaveXMMRegsWithCustomInserter( + MachineInstr &PseudoVarargThunkInstr, MachineBasicBlock *EntryBlk) const { + MachineBasicBlock *GuardedRegsBlk = nullptr; + MachineBasicBlock *TailBlk = nullptr; + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = PseudoVarargThunkInstr.getDebugLoc(); + int64_t ThunkRegSaveFrameIndex = + PseudoVarargThunkInstr.getOperand(1).getImm(); + int64_t VarArgsRegsOffset = PseudoVarargThunkInstr.getOperand(2).getImm(); + MachineFunction *Func = EntryBlk->getParent(); + bool NeedToAddLiveInsIntoGuardedRegsBlk = true; + + // check whether GuardedRegsBlk is already created by VASTART handling code + assert(Func->begin() != Func->end()); + for (auto &Succ : (*Func->begin()).successors()) { + if (Succ->isGuardedRegsBlk()) { + GuardedRegsBlk = Succ; + TailBlk = *GuardedRegsBlk->succ_begin(); + NeedToAddLiveInsIntoGuardedRegsBlk = false; + break; + } } + if (GuardedRegsBlk == nullptr) + std::tie(GuardedRegsBlk, TailBlk) = + CreateGuardedRegsBlock(EntryBlk, PseudoVarargThunkInstr, Subtarget); + + // Now add the instructions. + // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". - assert((MI.getNumOperands() <= 3 || - !MI.getOperand(MI.getNumOperands() - 1).isReg() || - MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) && + assert((PseudoVarargThunkInstr.getNumOperands() <= 3 || + !PseudoVarargThunkInstr + .getOperand(PseudoVarargThunkInstr.getNumOperands() - 1) + .isReg() || + PseudoVarargThunkInstr + .getOperand(PseudoVarargThunkInstr.getNumOperands() - 1) + .getReg() == X86::EFLAGS) && "Expected last argument to be EFLAGS"); + + // TODO: add support for YMM and ZMM here. 
unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + // In the XMM save block, save all the XMM argument registers. - for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { - int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; - MachineMemOperand *MMO = F->getMachineMemOperand( - MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), - MachineMemOperand::MOStore, - /*Size=*/16, /*Align=*/16); - BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) - .addFrameIndex(RegSaveFrameIndex) + for (unsigned OpndIdx = 3, RegIdx = 0; + OpndIdx + 1 < PseudoVarargThunkInstr.getNumOperands(); + OpndIdx++, RegIdx++) { + int64_t Offset = RegIdx * 16 + VarArgsRegsOffset; + + MachineMemOperand *MMO = + Func->getMachineMemOperand(MachinePointerInfo::getFixedStack( + *Func, ThunkRegSaveFrameIndex, Offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + BuildMI(GuardedRegsBlk, DL, TII->get(MOVOpc)) + .addFrameIndex(ThunkRegSaveFrameIndex) .addImm(/*Scale=*/1) .addReg(/*IndexReg=*/0) .addImm(/*Disp=*/Offset) .addReg(/*Segment=*/0) - .addReg(MI.getOperand(i).getReg()) + .addReg(PseudoVarargThunkInstr.getOperand(OpndIdx).getReg()) .addMemOperand(MMO); + assert(Register::isPhysicalRegister( + PseudoVarargThunkInstr.getOperand(OpndIdx).getReg())); + + if (NeedToAddLiveInsIntoGuardedRegsBlk) + GuardedRegsBlk->addLiveIn( + PseudoVarargThunkInstr.getOperand(OpndIdx).getReg()); } - MI.eraseFromParent(); // The pseudo instruction is gone now. + PseudoVarargThunkInstr + .eraseFromParent(); // The pseudo instruction is gone now. 
- return EndMBB; + return TailBlk; } // The EFLAGS operand of SelectItr might be missing a kill marker @@ -31333,6 +31513,9 @@ case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); + case X86::VARARG_THUNK_SAVE_XMM_REGS: + return EmitVarargThunkSaveXMMRegsWithCustomInserter(MI, BB); + case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); Index: llvm/lib/Target/X86/X86InstrCompiler.td =================================================================== --- llvm/lib/Target/X86/X86InstrCompiler.td +++ llvm/lib/Target/X86/X86InstrCompiler.td @@ -81,6 +81,19 @@ imm:$offset), (implicit EFLAGS)]>; +// x86-64 %al guarded thunk arguments lowering magic. +def VARARG_THUNK_SAVE_XMM_REGS : I<0, Pseudo, + (outs), + (ins GR8:$al, + i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#VARARG_THUNK_SAVE_XMM_REGS $al, $regsavefi, $offset", + [(X86vararg_thunk_save_xmm_regs GR8:$al, + imm:$regsavefi, + imm:$offset), + (implicit EFLAGS)]>; + + // The VAARG_64 pseudo-instruction takes the address of the va_list, // and places the address of the next argument into a register. 
let Defs = [EFLAGS] in Index: llvm/lib/Target/X86/X86InstrInfo.td =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.td +++ llvm/lib/Target/X86/X86InstrInfo.td @@ -99,6 +99,11 @@ SDTCisVT<1, iPTR>, SDTCisVT<2, iPTR>]>; +def SDT_X86VARARG_THUNK_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + + def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, @@ -190,6 +195,12 @@ SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", SDT_X86VASTART_SAVE_XMM_REGS, [SDNPHasChain, SDNPVariadic]>; + +def X86vararg_thunk_save_xmm_regs : + SDNode<"X86ISD::VARARG_THUNK_SAVE_XMM_REGS", + SDT_X86VARARG_THUNK_SAVE_XMM_REGS, + [SDNPHasChain, SDNPVariadic]>; + def X86vaarg64 : SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, Index: llvm/lib/Target/X86/X86MachineFunctionInfo.h =================================================================== --- llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -73,6 +73,9 @@ int VarArgsFrameIndex = 0; /// RegSaveFrameIndex - X86-64 vararg func register save area. int RegSaveFrameIndex = 0; + /// thunkRegSaveFrameIndex - X86-64 vararg func register save area for thunk + /// functions. + int thunkRegSaveFrameIndex = 0; /// VarArgsGPOffset - X86-64 vararg func int reg offset. unsigned VarArgsGPOffset = 0; /// VarArgsFPOffset - X86-64 vararg func fp reg offset. 
@@ -155,6 +158,9 @@ int getRegSaveFrameIndex() const { return RegSaveFrameIndex; } void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; } + int getThunkRegSaveFrameIndex() const { return thunkRegSaveFrameIndex; } + void setThunkRegSaveFrameIndex(int Idx) { thunkRegSaveFrameIndex = Idx; } + unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; } void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; } Index: llvm/test/CodeGen/X86/musttail-varargs.ll =================================================================== --- llvm/test/CodeGen/X86/musttail-varargs.ll +++ llvm/test/CodeGen/X86/musttail-varargs.ll @@ -1,9 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32 +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE +; RUN: llc -verify-machineinstrs -O0 < %s 
-enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE-OPT0 ; Test that we actually spill and reload all arguments in the variadic argument ; pack. Doing a normal call will clobber all argument registers, and we will @@ -29,8 +34,8 @@ ; LINUX-NEXT: .cfi_def_cfa_offset 48 ; LINUX-NEXT: pushq %rbx ; LINUX-NEXT: .cfi_def_cfa_offset 56 -; LINUX-NEXT: subq $360, %rsp # imm = 0x168 -; LINUX-NEXT: .cfi_def_cfa_offset 416 +; LINUX-NEXT: subq $232, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 288 ; LINUX-NEXT: .cfi_offset %rbx, -56 ; LINUX-NEXT: .cfi_offset %r12, -48 ; LINUX-NEXT: .cfi_offset %r13, -40 @@ -43,6 +48,11 @@ ; LINUX-NEXT: movq %rdx, %rbp ; LINUX-NEXT: movq %rsi, %rbx ; LINUX-NEXT: movq %rdi, %r14 +; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r9, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; LINUX-NEXT: testb %al, %al ; LINUX-NEXT: je .LBB0_2 @@ -56,11 +66,6 @@ ; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; LINUX-NEXT: .LBB0_2: -; LINUX-NEXT: movq %rbx, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %rbp, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r13, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r12, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r15, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax @@ -68,14 +73,6 @@ ; LINUX-NEXT: movabsq $206158430216, %rax # imm = 0x3000000008 ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %r14, %rdi -; LINUX-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; LINUX-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: callq get_f ; LINUX-NEXT: movq %rax, %r11 ; LINUX-NEXT: movq %r14, %rdi @@ -84,16 +81,36 @@ ; LINUX-NEXT: movq %r13, %rcx ; LINUX-NEXT: movq %r12, %r8 ; LINUX-NEXT: movq %r15, %r9 -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; LINUX-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload -; LINUX-NEXT: addq $360, %rsp # imm = 0x168 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB0_4 +; LINUX-NEXT: # %bb.3: +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-NEXT: addq $232, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 56 +; LINUX-NEXT: popq %rbx +; LINUX-NEXT: .cfi_def_cfa_offset 48 +; LINUX-NEXT: popq %r12 +; LINUX-NEXT: .cfi_def_cfa_offset 40 +; LINUX-NEXT: popq %r13 +; LINUX-NEXT: .cfi_def_cfa_offset 32 +; LINUX-NEXT: popq %r14 +; LINUX-NEXT: .cfi_def_cfa_offset 24 +; LINUX-NEXT: popq %r15 +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; 
LINUX-NEXT: popq %rbp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB0_4: +; LINUX-NEXT: .cfi_def_cfa_offset 288 +; LINUX-NEXT: addq $232, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 56 ; LINUX-NEXT: popq %rbx ; LINUX-NEXT: .cfi_def_cfa_offset 48 @@ -109,6 +126,85 @@ ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-OPT0-LABEL: f_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 336 +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: je .LBB0_2 +; LINUX-OPT0-NEXT: # %bb.1: +; LINUX-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB0_2: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; 
LINUX-OPT0-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdi, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r8b # 1-byte Reload +; LINUX-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; LINUX-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; LINUX-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movl $48, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movl $8, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, %rdi +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: callq get_f +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r10b # 1-byte Reload +; LINUX-OPT0-NEXT: movq %rax, (%rsp) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, %al +; LINUX-OPT0-NEXT: movq (%rsp), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB0_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: 
movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB0_4: +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 336 +; LINUX-OPT0-NEXT: addq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: f_thunk: ; LINUX-X32: # %bb.0: ; LINUX-X32-NEXT: pushq %rbp @@ -123,8 +219,8 @@ ; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 ; LINUX-X32-NEXT: pushq %rbx ; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 -; LINUX-X32-NEXT: subl $344, %esp # imm = 0x158 -; LINUX-X32-NEXT: .cfi_def_cfa_offset 400 +; LINUX-X32-NEXT: subl $216, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 272 ; LINUX-X32-NEXT: .cfi_offset %rbx, -56 ; LINUX-X32-NEXT: .cfi_offset %r12, -48 ; LINUX-X32-NEXT: .cfi_offset %r13, -40 @@ -137,6 +233,11 @@ ; LINUX-X32-NEXT: movq %rdx, %rbp ; LINUX-X32-NEXT: movq %rsi, %rbx ; LINUX-X32-NEXT: movl %edi, %r14d +; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rcx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r8, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r9, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; LINUX-X32-NEXT: testb %al, %al ; LINUX-X32-NEXT: je .LBB0_2 @@ -150,11 +251,6 @@ ; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: .LBB0_2: -; LINUX-X32-NEXT: movq %rbx, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %rbp, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r13, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r12, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r15, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax ; 
LINUX-X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax @@ -162,14 +258,6 @@ ; LINUX-X32-NEXT: movabsq $206158430216, %rax # imm = 0x3000000008 ; LINUX-X32-NEXT: movq %rax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movl %r14d, %edi -; LINUX-X32-NEXT: movaps %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: callq get_f ; LINUX-X32-NEXT: movl %eax, %r11d ; LINUX-X32-NEXT: movl %r14d, %edi @@ -178,16 +266,36 @@ ; LINUX-X32-NEXT: movq %r13, %rcx ; LINUX-X32-NEXT: movq %r12, %r8 ; LINUX-X32-NEXT: movq %r15, %r9 -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload ; LINUX-X32-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload -; LINUX-X32-NEXT: addl $344, %esp # imm = 0x158 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB0_4 +; LINUX-X32-NEXT: # %bb.3: +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; 
LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-NEXT: addl $216, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 +; LINUX-X32-NEXT: popq %rbx +; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 +; LINUX-X32-NEXT: popq %r12 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 40 +; LINUX-X32-NEXT: popq %r13 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 32 +; LINUX-X32-NEXT: popq %r14 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 24 +; LINUX-X32-NEXT: popq %r15 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: popq %rbp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB0_4: +; LINUX-X32-NEXT: .cfi_def_cfa_offset 272 +; LINUX-X32-NEXT: addl $216, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 ; LINUX-X32-NEXT: popq %rbx ; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 @@ -203,6 +311,87 @@ ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-X32-OPT0-LABEL: f_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 320 +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB0_2 +; LINUX-X32-OPT0-NEXT: # %bb.1: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, 
{{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB0_2: +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rax # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rax, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdi, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %r8b # 1-byte Reload +; LINUX-X32-OPT0-NEXT: leal {{[0-9]+}}(%rsp), %r9d +; LINUX-X32-OPT0-NEXT: movl %r9d, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: leal {{[0-9]+}}(%rsp), %r9d +; LINUX-X32-OPT0-NEXT: movl %r9d, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl $48, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl $8, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %r9d # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movl %r9d, %edi +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r8b, 
{{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: callq get_f +; LINUX-X32-OPT0-NEXT: movl %eax, %eax +; LINUX-X32-OPT0-NEXT: movl %eax, %ecx +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r10 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rcx, (%esp) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r10, %rcx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq (%esp), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB0_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB0_4: +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 320 +; LINUX-X32-OPT0-NEXT: addl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; WINDOWS-LABEL: f_thunk: ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: pushq %r14 @@ -246,6 +435,36 @@ ; WINDOWS-NEXT: .text ; WINDOWS-NEXT: .seh_endproc ; +; WINDOWS-OPT0-LABEL: f_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq 
$120, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 120 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r10 +; WINDOWS-OPT0-NEXT: movq %r10, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; WINDOWS-OPT0-NEXT: callq get_f +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Reload +; WINDOWS-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movb %r11b, %al +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $120, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; ; X86-NOSSE-LABEL: f_thunk: ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp @@ -264,6 +483,25 @@ ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: jmpl *%eax # TAILCALL ; +; X86-NOSSE-OPT0-LABEL: f_thunk: +; X86-NOSSE-OPT0: # %bb.0: +; X86-NOSSE-OPT0-NEXT: pushl %ebp +; X86-NOSSE-OPT0-NEXT: movl %esp, %ebp +; X86-NOSSE-OPT0-NEXT: andl $-16, %esp +; X86-NOSSE-OPT0-NEXT: subl $48, %esp +; X86-NOSSE-OPT0-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-OPT0-NEXT: leal 12(%ebp), %ecx +; X86-NOSSE-OPT0-NEXT: movl %ecx, 
{{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: movl %esp, %ecx +; X86-NOSSE-OPT0-NEXT: movl %eax, (%ecx) +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOSSE-OPT0-NEXT: calll _get_f +; X86-NOSSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl %ecx, 8(%ebp) +; X86-NOSSE-OPT0-NEXT: movl %ebp, %esp +; X86-NOSSE-OPT0-NEXT: popl %ebp +; X86-NOSSE-OPT0-NEXT: jmpl *%eax # TAILCALL +; ; X86-SSE-LABEL: f_thunk: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %ebp @@ -287,6 +525,31 @@ ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: jmpl *%eax # TAILCALL +; +; X86-SSE-OPT0-LABEL: f_thunk: +; X86-SSE-OPT0: # %bb.0: +; X86-SSE-OPT0-NEXT: pushl %ebp +; X86-SSE-OPT0-NEXT: movl %esp, %ebp +; X86-SSE-OPT0-NEXT: andl $-16, %esp +; X86-SSE-OPT0-NEXT: subl $112, %esp +; X86-SSE-OPT0-NEXT: movl 8(%ebp), %eax +; X86-SSE-OPT0-NEXT: leal 12(%ebp), %ecx +; X86-SSE-OPT0-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE-OPT0-NEXT: movl %esp, %ecx +; X86-SSE-OPT0-NEXT: movl %eax, (%ecx) +; X86-SSE-OPT0-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE-OPT0-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: calll _get_f +; X86-SSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE-OPT0-NEXT: movl %ecx, 8(%ebp) +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movl %ebp, %esp +; X86-SSE-OPT0-NEXT: popl %ebp +; X86-SSE-OPT0-NEXT: jmpl *%eax # TAILCALL %ap = alloca [4 x i8*], align 16 %ap_i8 = bitcast [4 x i8*]* %ap to i8* call void @llvm.va_start(i8* %ap_i8) @@ -300,27 +563,192 @@ ; No 
regparms on normal x86 conventions. -; This thunk shouldn't require any spills and reloads, assuming the register -; allocator knows what it's doing. +; This thunk stores xmms on entry and restores them before jumping. +; Storing and restoring xmms could be optimized out for this concrete case. define void @g_thunk(i8* %fptr_i8, ...) { ; LINUX-LABEL: g_thunk: ; LINUX: # %bb.0: -; LINUX-NEXT: pushq %rax -; LINUX-NEXT: .cfi_def_cfa_offset 16 -; LINUX-NEXT: popq %r11 +; LINUX-NEXT: subq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB1_2 +; LINUX-NEXT: # %bb.1: +; LINUX-NEXT: movaps %xmm0, (%rsp) +; LINUX-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-NEXT: .LBB1_2: +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB1_4 +; LINUX-NEXT: # %bb.3: +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps (%rsp), %xmm0 +; LINUX-NEXT: addq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%rdi # TAILCALL +; LINUX-NEXT: .LBB1_4: +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: addq $136, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%rdi # TAILCALL ; +; LINUX-OPT0-LABEL: g_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $200, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-OPT0-NEXT: movb %al, %r10b +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: je .LBB1_2 +; LINUX-OPT0-NEXT: # %bb.1: +; LINUX-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB1_2: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB1_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; 
LINUX-OPT0-NEXT: addq $200, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB1_4: +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-OPT0-NEXT: addq $200, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: g_thunk: ; LINUX-X32: # %bb.0: -; LINUX-X32-NEXT: pushq %rax -; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: subl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB1_2 +; LINUX-X32-NEXT: # %bb.1: +; LINUX-X32-NEXT: movaps %xmm0, (%esp) +; LINUX-X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: .LBB1_2: ; LINUX-X32-NEXT: movl %edi, %r11d -; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB1_4 +; LINUX-X32-NEXT: # %bb.3: +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps (%esp), %xmm0 +; LINUX-X32-NEXT: addl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB1_4: +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: addl $136, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-X32-OPT0-LABEL: g_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $200, %esp +; 
LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-X32-OPT0-NEXT: movb %al, %r10b +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB1_2 +; LINUX-X32-OPT0-NEXT: # %bb.1: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB1_2: +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl %eax, %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, (%esp) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq (%esp), %r11 # 8-byte Reload +; 
LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB1_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $200, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB1_4: +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-X32-OPT0-NEXT: addl $200, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; WINDOWS-LABEL: g_thunk: ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: subq $40, %rsp @@ -332,6 +760,19 @@ ; WINDOWS-NEXT: .text ; WINDOWS-NEXT: .seh_endproc ; +; WINDOWS-OPT0-LABEL: g_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq $40, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 40 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $40, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; ; X86-LABEL: g_thunk: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -353,41 +794,321 @@ define void @h_thunk(%struct.Foo* %this, ...) 
{ ; LINUX-LABEL: h_thunk: ; LINUX: # %bb.0: -; LINUX-NEXT: pushq %rax -; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: subq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_2 +; LINUX-NEXT: # %bb.1: +; LINUX-NEXT: movaps %xmm0, (%rsp) +; LINUX-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-NEXT: .LBB2_2: ; LINUX-NEXT: cmpb $1, (%rdi) -; LINUX-NEXT: jne .LBB2_2 -; LINUX-NEXT: # %bb.1: # %then +; LINUX-NEXT: jne .LBB2_4 +; LINUX-NEXT: # %bb.3: # %then ; LINUX-NEXT: movq 8(%rdi), %r11 -; LINUX-NEXT: addq $8, %rsp +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_6 +; LINUX-NEXT: # %bb.5: # %then +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps (%rsp), %xmm0 +; LINUX-NEXT: addq $136, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL -; LINUX-NEXT: .LBB2_2: # %else -; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: .LBB2_4: # %else +; LINUX-NEXT: .cfi_def_cfa_offset 144 ; LINUX-NEXT: movq 16(%rdi), %r11 ; LINUX-NEXT: movl $42, {{.*}}(%rip) -; LINUX-NEXT: addq $8, %rsp +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_8 +; LINUX-NEXT: # %bb.7: # %else +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), 
%xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps (%rsp), %xmm0 +; LINUX-NEXT: addq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB2_6: # %then +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: addq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB2_8: # %else +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: addq $136, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-OPT0-LABEL: h_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: movb %al, %r10b +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: je .LBB2_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB2_4: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: testb $1, (%rax) +; LINUX-OPT0-NEXT: jne .LBB2_1 +; LINUX-OPT0-NEXT: jmp .LBB2_2 +; LINUX-OPT0-NEXT: .LBB2_1: # %then +; LINUX-OPT0-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq 8(%rax), %rcx +; LINUX-OPT0-NEXT: movq %rax, %rdi +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, %rcx +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB2_6 +; LINUX-OPT0-NEXT: # %bb.5: # %then +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_6: # %then +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_2: # %else +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq 16(%rax), %rcx +; LINUX-OPT0-NEXT: movl $42, {{.*}}(%rip) +; LINUX-OPT0-NEXT: movq %rax, %rdi +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: 
movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, %rcx +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB2_8 +; LINUX-OPT0-NEXT: # %bb.7: # %else +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_8: # %else +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: h_thunk: ; LINUX-X32: # %bb.0: -; LINUX-X32-NEXT: pushq %rax -; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: subl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_2 +; LINUX-X32-NEXT: # %bb.1: +; LINUX-X32-NEXT: movaps %xmm0, (%esp) +; LINUX-X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: .LBB2_2: ; 
LINUX-X32-NEXT: cmpb $1, (%edi) -; LINUX-X32-NEXT: jne .LBB2_2 -; LINUX-X32-NEXT: # %bb.1: # %then +; LINUX-X32-NEXT: jne .LBB2_4 +; LINUX-X32-NEXT: # %bb.3: # %then ; LINUX-X32-NEXT: movl 4(%edi), %r11d -; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_6 +; LINUX-X32-NEXT: # %bb.5: # %then +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps (%esp), %xmm0 +; LINUX-X32-NEXT: addl $136, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL -; LINUX-X32-NEXT: .LBB2_2: # %else -; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: .LBB2_4: # %else +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 ; LINUX-X32-NEXT: movl 8(%edi), %r11d ; LINUX-X32-NEXT: movl $42, {{.*}}(%rip) -; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_8 +; LINUX-X32-NEXT: # %bb.7: # %else +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps (%esp), %xmm0 +; LINUX-X32-NEXT: addl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB2_6: # %then +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: addl $136, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB2_8: # %else +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; 
LINUX-X32-NEXT: addl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; +; LINUX-X32-OPT0-LABEL: h_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: movb %al, %r10b +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB2_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB2_4: +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: testb $1, (%eax) +; LINUX-X32-OPT0-NEXT: jne .LBB2_1 +; LINUX-X32-OPT0-NEXT: jmp .LBB2_2 +; LINUX-X32-OPT0-NEXT: .LBB2_1: # %then +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl 4(%eax), %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; 
LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB2_6 +; LINUX-X32-OPT0-NEXT: # %bb.5: # %then +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_6: # %then +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_2: # %else +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl 8(%eax), %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl $42, {{.*}}(%rip) +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; 
LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB2_8 +; LINUX-X32-OPT0-NEXT: # %bb.7: # %else +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_8: # %else +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL ; ; WINDOWS-LABEL: h_thunk: ; WINDOWS: # %bb.0: @@ -409,23 +1130,136 @@ ; WINDOWS-NEXT: .text ; WINDOWS-NEXT: .seh_endproc ; -; X86-LABEL: h_thunk: -; X86: # %bb.0: -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb $1, (%eax) -; X86-NEXT: jne LBB2_2 -; X86-NEXT: # %bb.1: # %then -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: popl %eax -; X86-NEXT: jmpl *%ecx # TAILCALL -; X86-NEXT: LBB2_2: # %else -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl $42, _g -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: popl %eax -; X86-NEXT: jmpl *%ecx # TAILCALL +; WINDOWS-OPT0-LABEL: h_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq 
$88, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 88 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: testb $1, (%rcx) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; WINDOWS-OPT0-NEXT: jne .LBB2_1 +; WINDOWS-OPT0-NEXT: jmp .LBB2_2 +; WINDOWS-OPT0-NEXT: .LBB2_1: # %then +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq 8(%rax), %rcx +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rax, %rcx +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $88, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .LBB2_2: # %else +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq 16(%rax), %rcx +; WINDOWS-OPT0-NEXT: movl $42, {{.*}}(%rip) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rax, %rcx +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $88, %rsp +; 
WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; +; X86-NOSSE-LABEL: h_thunk: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: cmpb $1, (%eax) +; X86-NOSSE-NEXT: jne LBB2_2 +; X86-NOSSE-NEXT: # %bb.1: # %then +; X86-NOSSE-NEXT: movl 4(%eax), %ecx +; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: jmpl *%ecx # TAILCALL +; X86-NOSSE-NEXT: LBB2_2: # %else +; X86-NOSSE-NEXT: movl 8(%eax), %ecx +; X86-NOSSE-NEXT: movl $42, _g +; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: jmpl *%ecx # TAILCALL +; +; X86-NOSSE-OPT0-LABEL: h_thunk: +; X86-NOSSE-OPT0: # %bb.0: +; X86-NOSSE-OPT0-NEXT: subl $8, %esp +; X86-NOSSE-OPT0-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-OPT0-NEXT: testb $1, (%eax) +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOSSE-OPT0-NEXT: jne LBB2_1 +; X86-NOSSE-OPT0-NEXT: jmp LBB2_2 +; X86-NOSSE-OPT0-NEXT: LBB2_1: # %then +; X86-NOSSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl 4(%eax), %ecx +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: addl $8, %esp +; X86-NOSSE-OPT0-NEXT: jmpl *%ecx # TAILCALL +; X86-NOSSE-OPT0-NEXT: LBB2_2: # %else +; X86-NOSSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl 8(%eax), %ecx +; X86-NOSSE-OPT0-NEXT: movl $42, _g +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: addl $8, %esp +; X86-NOSSE-OPT0-NEXT: jmpl *%ecx # TAILCALL +; +; X86-SSE-LABEL: h_thunk: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cmpb $1, (%eax) +; X86-SSE-NEXT: jne LBB2_2 +; X86-SSE-NEXT: # %bb.1: # %then +; X86-SSE-NEXT: movl 4(%eax), %ecx +; X86-SSE-NEXT: 
movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: jmpl *%ecx # TAILCALL +; X86-SSE-NEXT: LBB2_2: # %else +; X86-SSE-NEXT: movl 8(%eax), %ecx +; X86-SSE-NEXT: movl $42, _g +; X86-SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: jmpl *%ecx # TAILCALL +; +; X86-SSE-OPT0-LABEL: h_thunk: +; X86-SSE-OPT0: # %bb.0: +; X86-SSE-OPT0-NEXT: subl $92, %esp +; X86-SSE-OPT0-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-OPT0-NEXT: testb $1, (%eax) +; X86-SSE-OPT0-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE-OPT0-NEXT: movups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: jne LBB2_1 +; X86-SSE-OPT0-NEXT: jmp LBB2_2 +; X86-SSE-OPT0-NEXT: LBB2_1: # %then +; X86-SSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE-OPT0-NEXT: movl 4(%eax), %ecx +; X86-SSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-OPT0-NEXT: addl $92, %esp +; X86-SSE-OPT0-NEXT: jmpl *%ecx # TAILCALL +; X86-SSE-OPT0-NEXT: LBB2_2: # %else +; X86-SSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE-OPT0-NEXT: movl 8(%eax), %ecx +; X86-SSE-OPT0-NEXT: movl $42, _g +; X86-SSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-OPT0-NEXT: addl $92, %esp +; X86-SSE-OPT0-NEXT: jmpl *%ecx # TAILCALL %cond_p = getelementptr %struct.Foo, %struct.Foo* %this, i32 0, i32 
0 %cond = load i1, i1* %cond_p br i1 %cond, label %then, label %else Index: llvm/test/CodeGen/X86/vastart-defs-eflags.ll =================================================================== --- llvm/test/CodeGen/X86/vastart-defs-eflags.ll +++ llvm/test/CodeGen/X86/vastart-defs-eflags.ll @@ -9,6 +9,11 @@ ; CHECK-LABEL: check_flag: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je LBB0_2 ; CHECK-NEXT: ## %bb.1: ## %entry @@ -21,11 +26,6 @@ ; CHECK-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; CHECK-NEXT: LBB0_2: ## %entry -; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl $512, %edi ## imm = 0x200 ; CHECK-NEXT: je LBB0_4 Index: llvm/test/CodeGen/X86/x32-va_start.ll =================================================================== --- llvm/test/CodeGen/X86/x32-va_start.ll +++ llvm/test/CodeGen/X86/x32-va_start.ll @@ -27,6 +27,11 @@ call void @llvm.lifetime.start.p0i8(i64 16, i8* %0) #2 call void @llvm.va_start(i8* %0) ; SSE: subl $72, %esp +; CHECK-DAG: movq %r9 +; CHECK-DAG: movq %r8 +; CHECK-DAG: movq %rcx +; CHECK-DAG: movq %rdx +; CHECK-DAG: movq %rsi ; SSE: testb %al, %al ; SSE: je .[[NOFP:.*]] ; SSE-DAG: movaps %xmm1 @@ -38,11 +43,6 @@ ; SSE-DAG: movaps %xmm7 ; NOSSE-NOT: xmm ; SSE: .[[NOFP]]: -; CHECK-DAG: movq %r9 -; CHECK-DAG: movq %r8 -; CHECK-DAG: movq %rcx -; CHECK-DAG: movq %rdx -; CHECK-DAG: movq %rsi %gp_offset_p = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0, i32 0 %gp_offset = 
load i32, i32* %gp_offset_p, align 16 %fits_in_gp = icmp ult i32 %gp_offset, 41 Index: llvm/test/CodeGen/X86/xmm-vararg-noopt.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/xmm-vararg-noopt.ll @@ -0,0 +1,49 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s + +; CHECK-LABEL: testvarargs +; Ensure that xmm registers are not used before testing %al +; CHECK-NOT: xmm +; CHECK: testb %al, %al +; CHECK-NOT: xmm +; CHECK: # %bb.1 +; CHECK-NEXT: %xmm0, {{.*}}%rsp +; CHECK-NEXT: %xmm1, {{.*}}%rsp +; CHECK-NEXT: %xmm2, {{.*}}%rsp +; CHECK-NEXT: %xmm3, {{.*}}%rsp +; CHECK-NEXT: %xmm4, {{.*}}%rsp +; CHECK-NEXT: %xmm5, {{.*}}%rsp +; CHECK-NEXT: %xmm6, {{.*}}%rsp +; CHECK-NEXT: %xmm7, {{.*}}%rsp + +; ModuleID = 'variadic.c' +source_filename = "variadic.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux" + +%struct.__va_list_tag = type { i32, i32, i8*, i8* } + +@.str = private unnamed_addr constant [9 x i8] c"\0A hello \00", align 1 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @testvarargs(i8* %fmt, ...) { +entry: + %fmt.addr = alloca i8*, align 8 + %va = alloca [1 x %struct.__va_list_tag], align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %va, i64 0, i64 0 + %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8* + call void @llvm.va_start(i8* %arraydecay1) + %arraydecay2 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %va, i64 0, i64 0 + %arraydecay23 = bitcast %struct.__va_list_tag* %arraydecay2 to i8* + call void @llvm.va_end(i8* %arraydecay23) + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i64 0, i64 0)) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.va_start(i8*) + +; Function Attrs: nounwind +declare void @llvm.va_end(i8*) + +declare dso_local i32 @printf(i8*, ...)