diff --git a/llvm/include/llvm/CodeGen/CallingConvLower.h b/llvm/include/llvm/CodeGen/CallingConvLower.h --- a/llvm/include/llvm/CodeGen/CallingConvLower.h +++ b/llvm/include/llvm/CodeGen/CallingConvLower.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_CALLINGCONVLOWER_H #define LLVM_CODEGEN_CALLINGCONVLOWER_H +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -163,10 +164,12 @@ }; /// Describes a register that needs to be forwarded from the prologue to a -/// musttail call. +/// musttail call. A VReg of 0 means that the register is placed in the +/// guarded save area and that no virtual register has been created for it. struct ForwardedRegister { ForwardedRegister(unsigned VReg, MCPhysReg PReg, MVT VT) : VReg(VReg), PReg(PReg), VT(VT) {} + bool isGuarded() const { return VReg == 0; } unsigned VReg; MCPhysReg PReg; MVT VT; @@ -523,10 +526,14 @@ CCAssignFn Fn); /// Compute the set of registers that need to be preserved and forwarded to - /// any musttail calls. + /// any musttail calls. Some platforms (e.g. AMD64) allow accesses to certain + /// kinds of argument registers to be guarded; for example, XMM registers may + /// only be accessed according to the state of the %al register. This function + /// marks a forwarded register as guarded if it is contained in the specified + /// set of GuardedForwardedRegs. void analyzeMustTailForwardedRegisters( - SmallVectorImpl &Forwards, ArrayRef RegParmTypes, - CCAssignFn Fn); + SmallVectorImpl &Forwards, + const SmallDenseSet &GuardedForwardedRegs, + ArrayRef RegParmTypes, CCAssignFn Fn); /// Returns true if the results of the two calling conventions are compatible. /// This is usually part of the check for tailcall eligibility. diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp --- a/llvm/lib/CodeGen/CallingConvLower.cpp +++ b/llvm/lib/CodeGen/CallingConvLower.cpp @@ -236,8 +236,9 @@ } void CCState::analyzeMustTailForwardedRegisters( - SmallVectorImpl &Forwards, ArrayRef RegParmTypes, - CCAssignFn Fn) { + SmallVectorImpl &Forwards, + const SmallDenseSet &GuardedForwardedRegs, + ArrayRef RegParmTypes, CCAssignFn Fn) { // Oftentimes calling conventions will not user register parameters for // variadic functions, so we need to assume we're not variadic so that we get // all the registers that might be used in a non-variadic call. @@ -250,8 +251,11 @@ const TargetLowering *TL = MF.getSubtarget().getTargetLowering(); const TargetRegisterClass *RC = TL->getRegClassFor(RegVT); for (MCPhysReg PReg : RemainingRegs) { - unsigned VReg = MF.addLiveIn(PReg, RC); - Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT)); + if (GuardedForwardedRegs.count(PReg) == 0) { + unsigned VReg = MF.addLiveIn(PReg, RC); + Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT)); + } else + Forwards.push_back(ForwardedRegister(0, PReg, RegVT)); } } } diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp @@ -390,7 +390,10 @@ // Later on, we can use this vector to restore the registers if necessary.
SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn); + + SmallDenseSet GuardedRegs; + CCInfo.analyzeMustTailForwardedRegisters(Forwards, GuardedRegs, RegParmTypes, + AssignFn); // Conservatively forward X8, since it might be used for an aggregate // return. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3564,8 +3564,10 @@ // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, - CC_AArch64_AAPCS); + + SmallDenseSet GuardedRegs; + CCInfo.analyzeMustTailForwardedRegisters(Forwards, GuardedRegs, + RegParmTypes, CC_AArch64_AAPCS); // Conservatively forward X8, since it might be used for aggregate return. if (!CCInfo.isAllocated(AArch64::X8)) { diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -64,6 +64,12 @@ bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool ExpandMBB(MachineBasicBlock &MBB); + + void expandSaveVarargXmmRegs(MachineBasicBlock *MBB, + MachineBasicBlock::iterator MBBI) const; + + void createTailCallBlocksPair(MachineBasicBlock &OriginalTailCallBlk, + MachineBasicBlock::iterator &TCPseudoInstr); }; char X86ExpandPseudo::ID = 0; @@ -173,6 +179,154 @@ JTMBB->erase(JTInst); } +// This function replaces the original tail call instruction with two versions +// of it: one identical to the original, and one with XMM-register-restoring +// code inserted in front of the tail call. It also creates a branch that +// checks %al and selects the proper version of the tail call. This %al check +// is an AMD64-specific convention. +// +// f_thunk: f_thunk: +// # %bb.1: => # %bb.1: +// addq 32, %rsp testb %al, %al +// jmpq tc_func je .LBB0_2 +// # %bb.2: +// movaps 96(%rsp), %xmm0 +// addq 32, %rsp +// jmpq tc_func +// .LBB0_2: +// # %bb.3: +// addq 32, %rsp +// jmpq tc_func +// +void X86ExpandPseudo::createTailCallBlocksPair( + MachineBasicBlock &OriginalTailCallBlk, + MachineBasicBlock::iterator &TCPseudoInstr) { + + MachineFunction *Func = OriginalTailCallBlk.getParent(); + X86MachineFunctionInfo *X86Info = Func->getInfo(); + const auto &Forwards = X86Info->getForwardedMustTailRegParms(); + const BasicBlock *BB = OriginalTailCallBlk.getBasicBlock(); + + MachineBasicBlock::iterator TailCallMInstr = std::prev(TCPseudoInstr); + DebugLoc DL = TCPseudoInstr->getDebugLoc(); + + // Create the two blocks for the tail calls.
+ MachineFunction::iterator MBBIter = ++OriginalTailCallBlk.getIterator(); + MachineBasicBlock *TailCallBlkWithGuardedRegs = + Func->CreateMachineBasicBlock(BB); + MachineBasicBlock *TailCallBlk = Func->CreateMachineBasicBlock(BB); + Func->insert(MBBIter, TailCallBlkWithGuardedRegs); + Func->insert(MBBIter, TailCallBlk); + + TailCallBlk->transferSuccessors(&OriginalTailCallBlk); + OriginalTailCallBlk.addSuccessor(TailCallBlkWithGuardedRegs); + OriginalTailCallBlk.addSuccessor(TailCallBlk); + + // search for the first stack frame destroying instruction + MachineBasicBlock::iterator FirstStackFrameDestroyingInstr = &*TailCallMInstr; + for (MachineBasicBlock::iterator I = TCPseudoInstr; + I != OriginalTailCallBlk.begin(); --I) { + MachineBasicBlock::iterator PI = std::prev(I); + if (PI->getFlag(MachineInstr::FrameDestroy)) + FirstStackFrameDestroyingInstr = PI; + } + + // copy stack restoring code and tailcall instruction into + // two created blocks. Delete copied instructions from the + // OriginalTailCallBlk. + MachineBasicBlock::iterator CurInstr = FirstStackFrameDestroyingInstr; + do { + // duplicate instructions and put them into new blocks. + // handle CFI instructions separately. + if (CurInstr->isCFIInstruction()) { + BuildMI(*TailCallBlkWithGuardedRegs, TailCallBlkWithGuardedRegs->end(), + TailCallBlkWithGuardedRegs->findDebugLoc( + TailCallBlkWithGuardedRegs->begin()), + TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CurInstr->getOperand(0).getCFIIndex()); + BuildMI(*TailCallBlk, TailCallBlk->end(), + TailCallBlk->findDebugLoc(TailCallBlk->begin()), + TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CurInstr->getOperand(0).getCFIIndex()); + } else { + TII->duplicate(*TailCallBlkWithGuardedRegs, + TailCallBlkWithGuardedRegs->end(), *CurInstr); + + TII->duplicate(*TailCallBlk, TailCallBlk->end(), *CurInstr); + } + + // stop copying if we achieved tail call instruction + if (CurInstr->getOpcode() == TailCallMInstr->getOpcode()) { + OriginalTailCallBlk.erase(CurInstr); + break; + } + + CurInstr = &*OriginalTailCallBlk.erase(CurInstr); + } while (CurInstr != OriginalTailCallBlk.end()); + + // copy call site information into new tail call instructions + OriginalTailCallBlk.getParent()->copyCallSiteInfo( + &*TCPseudoInstr, &*TailCallBlkWithGuardedRegs->getLastNonDebugInstr()); + + OriginalTailCallBlk.getParent()->copyCallSiteInfo( + &*TCPseudoInstr, &*TailCallBlk->getLastNonDebugInstr()); + + // If %al is 0, branch around the XMM save block. + BuildMI(&OriginalTailCallBlk, DL, TII->get(X86::TEST8rr)) + .addReg(X86::AL) + .addReg(X86::AL); + BuildMI(&OriginalTailCallBlk, DL, TII->get(X86::JCC_1)) + .addMBB(TailCallBlk) + .addImm(X86::COND_E); + + // add code restoring xmm regsiters into start of TailCallInstrFromGuardedBlk + MachineInstr &TailCallInstrFromGuardedBlk = + *TailCallBlkWithGuardedRegs->getLastNonDebugInstr(); + + // TODO: take into account YMM, ZMM here + unsigned MOVOpc = STI->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm; + + unsigned BaseReg; + int64_t FrameOffset = X86FL->getFrameIndexReference( + *Func, X86Info->getThunkRegSaveFrameIndex(), BaseReg); + int64_t SaveAreaOffset = + (Func->getFrameInfo().hasVAStart() ? 
X86Info->getVarArgsFPOffset() : 0); + + int RegIdx = 0; + for (const auto &Fwd : Forwards) { + if (Fwd.isGuarded()) { + int64_t Offset = FrameOffset + SaveAreaOffset + RegIdx * 16; + + MachineMemOperand *MMO = Func->getMachineMemOperand( + MachinePointerInfo::getFixedStack( + *Func, X86Info->getThunkRegSaveFrameIndex(), Offset), + MachineMemOperand::MOLoad, + /*Size=*/16, /*Align=*/16); + + BuildMI(*TailCallBlkWithGuardedRegs, TailCallBlkWithGuardedRegs->begin(), + DL, TII->get(MOVOpc), Fwd.PReg) + .addReg(BaseReg) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addMemOperand(MMO); + + TailCallInstrFromGuardedBlk.addOperand( + MachineOperand::CreateReg(Fwd.PReg, false /*IsDef*/, true /*IsImp*/)); + RegIdx++; + } + } + + // Add live-ins to the newly created blocks. + for (auto &MO : TCPseudoInstr->operands()) { + if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) { + TailCallBlk->addLiveIn(MO.getReg()); + TailCallBlkWithGuardedRegs->addLiveIn(MO.getReg()); + } + } +} + /// If \p MBBI is a pseudo instruction, this method expands /// it to the corresponding (sequence of) actual instruction(s). /// \returns true if \p MBBI has been expanded. @@ -275,9 +429,28 @@ MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); - MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); + MBB.getParent()->copyCallSiteInfo(&*MBBI, &NewMI); + + MachineFunction *Func = MBB.getParent(); + X86MachineFunctionInfo *X86Info = Func->getInfo(); + const auto &Forwards = X86Info->getForwardedMustTailRegParms(); + + // If this tail-call return sequence belongs to a "musttail" vararg + // function and some of the forwarded registers are guarded, replace the + // current return sequence with two return sequences: one that restores + // the guarded registers and one that does not. Otherwise, leave the + // current return sequence as is. + if (Func->getFrameInfo().hasMustTailInVarArgFunc()) { + for (const auto &F : Forwards) + if (F.isGuarded()) { + createTailCallBlocksPair(MBB, MBBI); + break; + } + } // Delete the pseudo instruction TCRETURN. + MBB.getParent()->eraseCallSiteInfo(&*MBBI); MBB.erase(MBBI); return true; @@ -366,6 +539,10 @@ case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; + + case X86::SAVE_VARARG_XMM_REGS: + expandSaveVarargXmmRegs(&MBB, MBBI); + return true; } llvm_unreachable("Previous switch has a fallthrough?"); } @@ -386,6 +563,63 @@ return Modified; } +/// Replaces the X86::SAVE_VARARG_XMM_REGS pseudo instruction with a set of +/// store instructions for the specified vararg XMM registers. +/// +/// Operand [0] of X86::SAVE_VARARG_XMM_REGS is the frame index of the stack +/// area where the registers should be stored. +/// Operand [1] of X86::SAVE_VARARG_XMM_REGS is the offset inside that stack +/// frame area. +/// Operands [2] to the end of X86::SAVE_VARARG_XMM_REGS are the XMM +/// registers that should be stored.
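// A hedged illustration (not taken from the patch itself) of how this pseudo
// might look and expand; the frame index, offset, and register choices below
// are made up for the example. Given a slot for frame index 0 that resolves
// to 80(%rsp) and an $offset of 0, the pseudo
//   SAVE_VARARG_XMM_REGS 0, 0, $xmm0, $xmm1
// would be expanded by expandSaveVarargXmmRegs into two 16-byte-aligned
// stores, roughly
//   movaps %xmm0, 80(%rsp)
//   movaps %xmm1, 96(%rsp)
// (vmovaps on AVX targets), i.e. one store per forwarded XMM register at
// consecutive 16-byte offsets from the save area.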
+void X86ExpandPseudo::expandSaveVarargXmmRegs( + MachineBasicBlock *GuardedBlock, + MachineBasicBlock::iterator SaveVarargXmmRegsInstr) const { + assert(SaveVarargXmmRegsInstr->getOpcode() == X86::SAVE_VARARG_XMM_REGS); + + MachineFunction *Func = GuardedBlock->getParent(); + DebugLoc DL = SaveVarargXmmRegsInstr->getDebugLoc(); + + int64_t FrameIndex = SaveVarargXmmRegsInstr->getOperand(0).getImm(); + unsigned BaseReg; + int64_t FrameOffset = + X86FL->getFrameIndexReference(*Func, FrameIndex, BaseReg); + int64_t VarArgsRegsOffset = SaveVarargXmmRegsInstr->getOperand(1).getImm(); + + // TODO: add support for YMM and ZMM here. + unsigned MOVOpc = STI->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + + // In the XMM save block, save all the XMM argument registers. + for (int64_t OpndIdx = 2, RegIdx = 0; + OpndIdx < SaveVarargXmmRegsInstr->getNumOperands(); + OpndIdx++, RegIdx++) { + + int64_t Offset = FrameOffset + VarArgsRegsOffset + RegIdx * 16; + + MachineMemOperand *MMO = Func->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*Func, FrameIndex, Offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + + BuildMI(GuardedBlock, DL, TII->get(MOVOpc)) + .addReg(BaseReg) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addReg(SaveVarargXmmRegsInstr->getOperand(OpndIdx).getReg()) + .addMemOperand(MMO); + assert(Register::isPhysicalRegister( + SaveVarargXmmRegsInstr->getOperand(OpndIdx).getReg())); + + GuardedBlock->addLiveIn( + SaveVarargXmmRegsInstr->getOperand(OpndIdx).getReg()); + } + + // Delete the pseudo. + SaveVarargXmmRegsInstr->eraseFromParent(); +} + bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { STI = &static_cast(MF.getSubtarget()); TII = STI->getInstrInfo(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -529,6 +529,11 @@ // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, + // Save xmm argument registers of the vararg thunk function to the stack, + // according to %al. An operator is needed so that this can be expanded with + // control flow. + MUSTTAIL_SAVE_GUARDED_REGS, + // Windows's _chkstk call to do stack probing. WIN_ALLOCA, @@ -1430,14 +1435,23 @@ // Utility function to emit the low-level va_arg code for X86-64. MachineBasicBlock * - EmitVAARG64WithCustomInserter(MachineInstr &MI, + emitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const; /// Utility function to emit the xmm reg save portion of va_start. MachineBasicBlock * - EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, + emitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, MachineBasicBlock *BB) const; + /// Utility function to emit the guarded xmm regs saving block. 
+ MachineBasicBlock * + emitVarargThunkSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, + MachineBasicBlock *BB) const; + + void addSaveVarargXmmRegsPseudo(MachineBasicBlock *GuardedRegsBlk, + MachineBasicBlock *TailBlk, + MachineInstr &SrcPseudoInstr) const; + MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1, MachineInstr &MI2, MachineBasicBlock *BB) const; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3540,9 +3540,14 @@ F.hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); + SmallDenseSet GuardedRegs; + SmallVector LiveGPRs; + SmallVector LiveXMMRegs; + SDValue ALVal; + // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. - if (Is64Bit && isVarArg && MFI.hasVAStart()) { + if (Is64Bit && isVarArg) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); @@ -3552,77 +3557,94 @@ "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. - SmallVector LiveGPRs; - SmallVector LiveXMMRegs; - SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } + if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { - unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); - LiveXMMRegs.push_back( - DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); - } - } - - if (IsWin64) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex( - MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - // Fixup to set vararg frame on shadow area (4 x i64). - if (NumIntRegs < 4) - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); - } else { - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so - // they may be loaded by dereferencing the result of va_next. - FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( - ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); - } - - // Store the integer parameter registers. - SmallVector MemOps; - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy(DAG.getDataLayout())); - unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (SDValue Val : LiveGPRs) { - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - RSFIN, DAG.getIntPtrConstant(Offset, dl)); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), - FuncInfo->getRegSaveFrameIndex(), Offset)); - MemOps.push_back(Store); - Offset += 8; - } - - if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { - // Now store the XMM (fp + vector) parameter registers. 
- SmallVector SaveXMMOps; - SaveXMMOps.push_back(Chain); - SaveXMMOps.push_back(ALVal); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getRegSaveFrameIndex(), dl)); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getVarArgsFPOffset(), dl)); - SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), - LiveXMMRegs.end()); - MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, SaveXMMOps)); + // The fast register allocator spills virtual registers at basic block + // boundaries, which would lead to XMM registers being used outside of + // the %al check. Pass physical registers to VASTART_SAVE_XMM_REGS to + // avoid unnecessary spilling. + MF.getRegInfo().addLiveIn(Reg); + LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32)); + + // 'musttail' calls forward the input registers from the thunk function + // to the callee. The AMD64 ABI lets us avoid touching XMM registers by + // guarding them with the %al register, which matters for the + // NoImplicitFloat case. To implement that behavior, we build the set of + // registers that must be guarded while forwarding. + // + // We currently do not guard registers for functions with the "thunk" + // attribute. That attribute marks a special kind of thunk, a perfectly + // forwarding thunk, and functions with the "thunk" attribute are assumed + // not to be used in the NoImplicitFloat case. + if (!F.hasFnAttribute("thunk")) + GuardedRegs.insert(Reg); + } + } + + if (MFI.hasVAStart()) { + if (IsWin64) { + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( + MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by dereferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); + } + + // Store the integer parameter registers. + SmallVector MemOps; + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy(DAG.getDataLayout())); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = + DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, + DAG.getIntPtrConstant(Offset, dl)); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset)); + MemOps.push_back(Store); + Offset += 8; + } + + if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { + // Now store the XMM (fp + vector) parameter registers.
+ SmallVector SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), dl)); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), dl)); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI.hasMustTailInVarArgFunc()) { @@ -3648,7 +3670,8 @@ // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, GuardedRegs, + RegParmTypes, CC_X86); // Forward AL for SysV x86_64 targets, since it is used for varargs. if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) { @@ -3659,9 +3682,42 @@ // Copy all forwards from physical to virtual registers. for (ForwardedRegister &FR : Forwards) { // FIXME: Can we use a less constrained schedule? - SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); - FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); - Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); + if (!FR.isGuarded()) { + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); + FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); + Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); + } + } + + if (!GuardedRegs.empty()) { + if (MFI.hasVAStart()) { + // all incoming xmm registers are already stored by VAStart + // handling. Reuse these stored values for thunk forwarded + // parameters here. + FuncInfo->setThunkRegSaveFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // TODO: implement support for YMM, ZMM vararg registers + + // allocate stack space to save registers which should be guarded by + // ABI, 16 is size of XMM + FuncInfo->setThunkRegSaveFrameIndex( + MFI.CreateStackObject(GuardedRegs.size() * 16, 16, false)); + + // Save guarded forwards into guarded area + SmallVector VarargMemOps; + SmallVector VarargXMMOps; + VarargXMMOps.push_back(Chain); + VarargXMMOps.push_back(ALVal); + VarargXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getThunkRegSaveFrameIndex(), dl)); + VarargXMMOps.push_back(DAG.getIntPtrConstant(0, dl)); + VarargXMMOps.insert(VarargXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + VarargMemOps.push_back(DAG.getNode(X86ISD::MUSTTAIL_SAVE_GUARDED_REGS, + dl, MVT::Other, VarargXMMOps)); + if (!VarargMemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VarargMemOps); + } } } @@ -3683,8 +3739,9 @@ } if (!Is64Bit) { - // RegSaveFrameIndex is X86-64 only. + // RegSaveFrameIndex and ThunkRegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + FuncInfo->setThunkRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. 
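For context on the %al convention that the guarding above relies on, here is a small, self-contained C++ sketch; the function names and values are illustrative only and are not part of the patch. On SysV x86-64 the caller of a variadic function reports in %al an upper bound on the number of vector registers used for the variadic arguments, so a callee may skip every XMM store or reload when %al is zero.

#include <cstdarg>
#include <cstdio>

// Illustrative only: a plain SysV x86-64 variadic callee. Its va_start
// lowering stores %xmm0-%xmm7 to the register save area only when %al != 0,
// which is the same guard reused above for musttail-forwarded registers.
static double sum_doubles(int count, ...) {
  va_list ap;
  va_start(ap, count);
  double total = 0.0;
  for (int i = 0; i < count; ++i)
    total += va_arg(ap, double); // floating-point varargs arrive in XMM regs
  va_end(ap);
  return total;
}

int main() {
  // This call uses two XMM argument registers, so the caller sets %al = 2.
  // A call site passing no floating-point varargs would set %al = 0, letting
  // the callee skip all of its XMM save/restore code.
  std::printf("%f\n", sum_doubles(2, 1.0, 2.5));
  return 0;
}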
@@ -4086,8 +4143,10 @@ if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { - SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + if (!F.isGuarded()) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + } } } @@ -29878,6 +29937,7 @@ NODE_NAME_CASE(PSADBW) NODE_NAME_CASE(DBPSADBW) NODE_NAME_CASE(VASTART_SAVE_XMM_REGS) + NODE_NAME_CASE(MUSTTAIL_SAVE_GUARDED_REGS) NODE_NAME_CASE(VAARG_64) NODE_NAME_CASE(WIN_ALLOCA) NODE_NAME_CASE(MEMBARRIER) @@ -30392,7 +30452,7 @@ MachineBasicBlock * -X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, +X86TargetLowering::emitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. @@ -30651,79 +30711,148 @@ return endMBB; } -MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( - MachineInstr &MI, MachineBasicBlock *MBB) const { - // Emit code to save XMM registers to the stack. The ABI says that the - // number of registers to save is given in %al, so it's theoretically - // possible to do an indirect jump trick to avoid saving all of them, - // however this code takes a simpler approach and just executes all - // of the stores if %al is non-zero. It's less code, and it's probably - // easier on the hardware branch predictor, and stores aren't all that - // expensive anyway. +// This function creates additional block for storing varargs guarded +// registers. It adds check for %al into entry block, to skip +// GuardedRegsBlk if xmm registers should not be stored. +// +// EntryBlk[VAPseudoInstr] EntryBlk +// | | . +// | | . +// | | GuardedRegsBlk +// | => | . +// | | . +// | TailBlk[VAPseudoInstr] +// | | +// | | +// +static std::pair +createGuardedRegsBlock(MachineBasicBlock *EntryBlk, MachineInstr &VAPseudoInstr, + const X86Subtarget &Subtarget) { + + MachineFunction *Func = EntryBlk->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = VAPseudoInstr.getDebugLoc(); + Register CountReg = VAPseudoInstr.getOperand(0).getReg(); // Create the new basic blocks. One block contains all the XMM stores, // and one block is the final destination regardless of whether any // stores were performed. - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction *F = MBB->getParent(); - MachineFunction::iterator MBBIter = ++MBB->getIterator(); - MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, XMMSaveMBB); - F->insert(MBBIter, EndMBB); + const BasicBlock *LLVMBlk = EntryBlk->getBasicBlock(); + MachineFunction::iterator EntryBlkIter = ++EntryBlk->getIterator(); + MachineBasicBlock *GuardedRegsBlk = Func->CreateMachineBasicBlock(LLVMBlk); + MachineBasicBlock *TailBlk = Func->CreateMachineBasicBlock(LLVMBlk); + Func->insert(EntryBlkIter, GuardedRegsBlk); + Func->insert(EntryBlkIter, TailBlk); // Transfer the remainder of MBB and its successor edges to EndMBB. 
- EndMBB->splice(EndMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - EndMBB->transferSuccessorsAndUpdatePHIs(MBB); + TailBlk->splice(TailBlk->begin(), EntryBlk, + std::next(MachineBasicBlock::iterator(VAPseudoInstr)), + EntryBlk->end()); + TailBlk->transferSuccessorsAndUpdatePHIs(EntryBlk); // The original block will now fall through to the XMM save block. - MBB->addSuccessor(XMMSaveMBB); + EntryBlk->addSuccessor(GuardedRegsBlk); // The XMMSaveMBB will fall through to the end block. - XMMSaveMBB->addSuccessor(EndMBB); - - // Now add the instructions. - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); - - Register CountReg = MI.getOperand(0).getReg(); - int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); - int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); + GuardedRegsBlk->addSuccessor(TailBlk); - if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { + if (!Subtarget.isCallingConvWin64(Func->getFunction().getCallingConv())) { // If %al is 0, branch around the XMM save block. - BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E); - MBB->addSuccessor(EndMBB); + BuildMI(EntryBlk, DL, TII->get(X86::TEST8rr)) + .addReg(CountReg) + .addReg(CountReg); + BuildMI(EntryBlk, DL, TII->get(X86::JCC_1)) + .addMBB(TailBlk) + .addImm(X86::COND_E); + EntryBlk->addSuccessor(TailBlk); } + return std::make_pair(GuardedRegsBlk, TailBlk); +} + +void X86TargetLowering::addSaveVarargXmmRegsPseudo( + MachineBasicBlock *GuardedRegsBlk, MachineBasicBlock *TailBlk, + MachineInstr &SrcPseudoInstr) const { // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". - assert((MI.getNumOperands() <= 3 || - !MI.getOperand(MI.getNumOperands() - 1).isReg() || - MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) && + assert((SrcPseudoInstr.getNumOperands() <= 3 || + !SrcPseudoInstr.getOperand(SrcPseudoInstr.getNumOperands() - 1) + .isReg() || + SrcPseudoInstr.getOperand(SrcPseudoInstr.getNumOperands() - 1) + .getReg() == X86::EFLAGS) && "Expected last argument to be EFLAGS"); - unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; - // In the XMM save block, save all the XMM argument registers. - for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { - int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; - MachineMemOperand *MMO = F->getMachineMemOperand( - MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), - MachineMemOperand::MOStore, - /*Size=*/16, /*Align=*/16); - BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) - .addFrameIndex(RegSaveFrameIndex) - .addImm(/*Scale=*/1) - .addReg(/*IndexReg=*/0) - .addImm(/*Disp=*/Offset) - .addReg(/*Segment=*/0) - .addReg(MI.getOperand(i).getReg()) - .addMemOperand(MMO); + + // create SAVE_VARARG_XMM_REGS pseudo + MachineInstrBuilder MIB = + BuildMI(GuardedRegsBlk, SrcPseudoInstr.getDebugLoc(), + Subtarget.getInstrInfo()->get(X86::SAVE_VARARG_XMM_REGS)); + + // set Frame Index + MIB.addImm(SrcPseudoInstr.getOperand(1).getImm()); + + // set ArgsOffset + MIB.addImm(SrcPseudoInstr.getOperand(2).getImm()); + + for (unsigned OpndIdx = 3, RegIdx = 0; + OpndIdx + 1 < SrcPseudoInstr.getNumOperands(); OpndIdx++, RegIdx++) + MIB.addReg(SrcPseudoInstr.getOperand(OpndIdx).getReg(), + RegState::InternalRead); + + SrcPseudoInstr.eraseFromParent(); // The pseudo instruction is gone now. 
+} + +MachineBasicBlock *X86TargetLowering::emitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr &PseudoVaStartInstr, MachineBasicBlock *EntryBlk) const { + // Emit code to save XMM registers to the stack. The ABI says that the + // number of registers to save is given in %al, so it's theoretically + // possible to do an indirect jump trick to avoid saving all of them, + // however this code takes a simpler approach and just executes all + // of the stores if %al is non-zero. It's less code, and it's probably + // easier on the hardware branch predictor, and stores aren't all that + // expensive anyway. + + MachineBasicBlock *GuardedRegsBlk = nullptr; + MachineBasicBlock *TailBlk = nullptr; + + std::tie(GuardedRegsBlk, TailBlk) = + createGuardedRegsBlock(EntryBlk, PseudoVaStartInstr, Subtarget); + + addSaveVarargXmmRegsPseudo(GuardedRegsBlk, TailBlk, PseudoVaStartInstr); + + return TailBlk; +} + +MachineBasicBlock * +X86TargetLowering::emitVarargThunkSaveXMMRegsWithCustomInserter( + MachineInstr &PseudoVarargThunkInstr, MachineBasicBlock *EntryBlk) const { + MachineBasicBlock *GuardedRegsBlk = nullptr; + MachineBasicBlock *TailBlk = nullptr; + MachineFunction *Func = EntryBlk->getParent(); + + // check whether GuardedRegsBlk is already created by VASTART handling code + assert(Func->begin() != Func->end()); + for (auto &Succ : (*Func->begin()).successors()) { + + for (auto &Instr : Succ->instrs()) { + if (Instr.getOpcode() == X86::SAVE_VARARG_XMM_REGS) { + // GuardedRegsBlk is already created by VASTART handling code + assert(Func->getFrameInfo().hasVAStart()); + GuardedRegsBlk = Succ; + TailBlk = *GuardedRegsBlk->succ_begin(); + break; + } + } + + if (GuardedRegsBlk) + break; } - MI.eraseFromParent(); // The pseudo instruction is gone now. + if (GuardedRegsBlk == nullptr) + std::tie(GuardedRegsBlk, TailBlk) = + createGuardedRegsBlock(EntryBlk, PseudoVarargThunkInstr, Subtarget); - return EndMBB; + addSaveVarargXmmRegsPseudo(GuardedRegsBlk, TailBlk, PseudoVarargThunkInstr); + + return TailBlk; } // The EFLAGS operand of SelectItr might be missing a kill marker @@ -32539,10 +32668,13 @@ return emitXBegin(MI, BB, Subtarget.getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: - return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); + return emitVAStartSaveXMMRegsWithCustomInserter(MI, BB); + + case X86::MUSTTAIL_SAVE_GUARDED_REGS: + return emitVarargThunkSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: - return EmitVAARG64WithCustomInserter(MI, BB); + return emitVAARG64WithCustomInserter(MI, BB); case X86::EH_SjLj_SetJmp32: case X86::EH_SjLj_SetJmp64: diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -68,6 +68,15 @@ let SchedRW = [WriteSystem] in { +let hasSideEffects = 1 in { +def SAVE_VARARG_XMM_REGS : I<0, Pseudo, + (outs), + (ins i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#SAVE_VARARG_XMM_REGS $regsavefi, $offset", + []>; +} + // x86-64 va_start lowering magic. let usesCustomInserter = 1, Defs = [EFLAGS] in { def VASTART_SAVE_XMM_REGS : I<0, Pseudo, @@ -81,6 +90,19 @@ imm:$offset), (implicit EFLAGS)]>; +// x86-64 %al guarded thunk arguments lowering magic. 
+def MUSTTAIL_SAVE_GUARDED_REGS : I<0, Pseudo, + (outs), + (ins GR8:$al, + i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#MUSTTAIL_SAVE_GUARDED_REGS $al, $regsavefi, $offset", + [(X86musttail_save_guarded_regs GR8:$al, + imm:$regsavefi, + imm:$offset), + (implicit EFLAGS)]>; + + // The VAARG_64 pseudo-instruction takes the address of the va_list, // and places the address of the next argument into a register. let Defs = [EFLAGS] in diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -99,6 +99,11 @@ SDTCisVT<1, iPTR>, SDTCisVT<2, iPTR>]>; +def SDT_X86MUSTTAIL_SAVE_GUARDED_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + + def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, @@ -192,6 +197,12 @@ SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", SDT_X86VASTART_SAVE_XMM_REGS, [SDNPHasChain, SDNPVariadic]>; + +def X86musttail_save_guarded_regs : + SDNode<"X86ISD::MUSTTAIL_SAVE_GUARDED_REGS", + SDT_X86MUSTTAIL_SAVE_GUARDED_REGS, + [SDNPHasChain, SDNPVariadic]>; + def X86vaarg64 : SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -73,6 +73,9 @@ int VarArgsFrameIndex = 0; /// RegSaveFrameIndex - X86-64 vararg func register save area. int RegSaveFrameIndex = 0; + /// ThunkRegSaveFrameIndex - X86-64 vararg func register save area for thunk + /// functions. + int ThunkRegSaveFrameIndex = 0; /// VarArgsGPOffset - X86-64 vararg func int reg offset. unsigned VarArgsGPOffset = 0; /// VarArgsFPOffset - X86-64 vararg func fp reg offset. @@ -155,6 +158,9 @@ int getRegSaveFrameIndex() const { return RegSaveFrameIndex; } void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; } + int getThunkRegSaveFrameIndex() const { return ThunkRegSaveFrameIndex; } + void setThunkRegSaveFrameIndex(int Idx) { ThunkRegSaveFrameIndex = Idx; } + unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; } void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; } diff --git a/llvm/test/CodeGen/X86/icall-branch-funnel.ll b/llvm/test/CodeGen/X86/icall-branch-funnel.ll --- a/llvm/test/CodeGen/X86/icall-branch-funnel.ll +++ b/llvm/test/CodeGen/X86/icall-branch-funnel.ll @@ -19,10 +19,10 @@ ; CHECK: jt2: ; CHECK: leaq g+1(%rip), %r11 ; CHECK-NEXT: cmpq %r11, %r10 - ; CHECK-NEXT: jae .LBB0_1 + ; CHECK-NEXT: jae .LBB0_3 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f0 - ; CHECK-NEXT: .LBB0_1: + ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: jmp f1 musttail call void (...) @llvm.icall.branch.funnel( i8* %0, @@ -37,14 +37,14 @@ ; CHECK: jt3: ; CHECK: leaq g+1(%rip), %r11 ; CHECK-NEXT: cmpq %r11, %r10 - ; CHECK-NEXT: jae .LBB1_1 + ; CHECK-NEXT: jae .LBB1_3 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f0 - ; CHECK-NEXT: .LBB1_1: - ; CHECK-NEXT: jne .LBB1_2 + ; CHECK-NEXT: .LBB1_3: + ; CHECK-NEXT: jne .LBB1_4 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f1 - ; CHECK-NEXT: .LBB1_2: + ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: jmp f2 musttail call void (...) 
@llvm.icall.branch.funnel( i8* %0, @@ -60,34 +60,34 @@ ; CHECK: jt7: ; CHECK: leaq g+3(%rip), %r11 ; CHECK-NEXT: cmpq %r11, %r10 - ; CHECK-NEXT: jae .LBB2_1 + ; CHECK-NEXT: jae .LBB2_3 ; CHECK-NEXT: # ; CHECK-NEXT: leaq g+1(%rip), %r11 ; CHECK-NEXT: cmpq %r11, %r10 - ; CHECK-NEXT: jae .LBB2_6 + ; CHECK-NEXT: jae .LBB2_8 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f0 - ; CHECK-NEXT: .LBB2_1: - ; CHECK-NEXT: jne .LBB2_2 + ; CHECK-NEXT: .LBB2_3: + ; CHECK-NEXT: jne .LBB2_4 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f3 - ; CHECK-NEXT: .LBB2_6: - ; CHECK-NEXT: jne .LBB2_7 + ; CHECK-NEXT: .LBB2_8: + ; CHECK-NEXT: jne .LBB2_9 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f1 - ; CHECK-NEXT: .LBB2_2: + ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: leaq g+5(%rip), %r11 ; CHECK-NEXT: cmpq %r11, %r10 - ; CHECK-NEXT: jae .LBB2_3 + ; CHECK-NEXT: jae .LBB2_5 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f4 - ; CHECK-NEXT: .LBB2_7: + ; CHECK-NEXT: .LBB2_9: ; CHECK-NEXT: jmp f2 - ; CHECK-NEXT: .LBB2_3: - ; CHECK-NEXT: jne .LBB2_4 + ; CHECK-NEXT: .LBB2_5: + ; CHECK-NEXT: jne .LBB2_6 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f5 - ; CHECK-NEXT: .LBB2_4: + ; CHECK-NEXT: .LBB2_6: ; CHECK-NEXT: jmp f6 musttail call void (...) @llvm.icall.branch.funnel( i8* %0, @@ -107,50 +107,50 @@ ; CHECK: jt10: ; CHECK: leaq g+5(%rip), %r11 ; CHECK-NEXT: cmpq %r11, %r10 - ; CHECK-NEXT: jae .LBB3_1 + ; CHECK-NEXT: jae .LBB3_3 ; CHECK-NEXT: # ; CHECK-NEXT: leaq g+1(%rip), %r11 ; CHECK-NEXT: cmpq %r11, %r10 - ; CHECK-NEXT: jae .LBB3_7 + ; CHECK-NEXT: jae .LBB3_9 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f0 - ; CHECK-NEXT: .LBB3_1: - ; CHECK-NEXT: jne .LBB3_2 + ; CHECK-NEXT: .LBB3_3: + ; CHECK-NEXT: jne .LBB3_4 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f5 - ; CHECK-NEXT: .LBB3_7: - ; CHECK-NEXT: jne .LBB3_8 + ; CHECK-NEXT: .LBB3_9: + ; CHECK-NEXT: jne .LBB3_10 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f1 - ; CHECK-NEXT: .LBB3_2: + ; CHECK-NEXT: .LBB3_4: ; CHECK-NEXT: leaq g+7(%rip), %r11 ; CHECK-NEXT: cmpq %r11, %r10 - ; CHECK-NEXT: jae .LBB3_3 + ; CHECK-NEXT: jae .LBB3_5 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f6 - ; CHECK-NEXT: .LBB3_8: + ; CHECK-NEXT: .LBB3_10: ; CHECK-NEXT: leaq g+3(%rip), %r11 ; CHECK-NEXT: cmpq %r11, %r10 - ; CHECK-NEXT: jae .LBB3_9 + ; CHECK-NEXT: jae .LBB3_11 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f2 - ; CHECK-NEXT: .LBB3_3: - ; CHECK-NEXT: jne .LBB3_4 + ; CHECK-NEXT: .LBB3_5: + ; CHECK-NEXT: jne .LBB3_6 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f7 - ; CHECK-NEXT: .LBB3_9: - ; CHECK-NEXT: jne .LBB3_10 + ; CHECK-NEXT: .LBB3_11: + ; CHECK-NEXT: jne .LBB3_12 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f3 - ; CHECK-NEXT: .LBB3_4: + ; CHECK-NEXT: .LBB3_6: ; CHECK-NEXT: leaq g+9(%rip), %r11 ; CHECK-NEXT: cmpq %r11, %r10 - ; CHECK-NEXT: jae .LBB3_5 + ; CHECK-NEXT: jae .LBB3_7 ; CHECK-NEXT: # ; CHECK-NEXT: jmp f8 - ; CHECK-NEXT: .LBB3_10: + ; CHECK-NEXT: .LBB3_12: ; CHECK-NEXT: jmp f4 - ; CHECK-NEXT: .LBB3_5: + ; CHECK-NEXT: .LBB3_7: ; CHECK-NEXT: jmp f9 musttail call void (...) 
@llvm.icall.branch.funnel( i8* %0, diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll --- a/llvm/test/CodeGen/X86/musttail-varargs.ll +++ b/llvm/test/CodeGen/X86/musttail-varargs.ll @@ -1,9 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32 +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE-OPT0 ; Test that we actually spill and reload all arguments in the variadic argument ; pack. Doing a normal call will clobber all argument registers, and we will @@ -29,8 +34,8 @@ ; LINUX-NEXT: .cfi_def_cfa_offset 48 ; LINUX-NEXT: pushq %rbx ; LINUX-NEXT: .cfi_def_cfa_offset 56 -; LINUX-NEXT: subq $360, %rsp # imm = 0x168 -; LINUX-NEXT: .cfi_def_cfa_offset 416 +; LINUX-NEXT: subq $232, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 288 ; LINUX-NEXT: .cfi_offset %rbx, -56 ; LINUX-NEXT: .cfi_offset %r12, -48 ; LINUX-NEXT: .cfi_offset %r13, -40 @@ -43,6 +48,11 @@ ; LINUX-NEXT: movq %rdx, %rbp ; LINUX-NEXT: movq %rsi, %rbx ; LINUX-NEXT: movq %rdi, %r14 +; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r9, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; LINUX-NEXT: testb %al, %al ; LINUX-NEXT: je .LBB0_2 @@ -56,11 +66,6 @@ ; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; LINUX-NEXT: .LBB0_2: -; LINUX-NEXT: movq %rbx, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %rbp, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r13, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r12, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r15, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax @@ -68,14 +73,6 @@ ; LINUX-NEXT: movabsq $206158430216, %rax # imm = 0x3000000008 ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %r14, %rdi -; LINUX-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: callq get_f ; LINUX-NEXT: movq %rax, %r11 ; LINUX-NEXT: movq %r14, %rdi @@ -84,16 +81,36 @@ ; LINUX-NEXT: movq %r13, %rcx ; LINUX-NEXT: movq %r12, %r8 ; LINUX-NEXT: movq %r15, %r9 -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; LINUX-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload -; LINUX-NEXT: addq $360, %rsp # imm = 0x168 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB0_4 +; LINUX-NEXT: # %bb.3: +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-NEXT: addq $232, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 56 +; LINUX-NEXT: popq %rbx +; LINUX-NEXT: .cfi_def_cfa_offset 48 +; LINUX-NEXT: popq %r12 +; LINUX-NEXT: .cfi_def_cfa_offset 40 +; LINUX-NEXT: popq %r13 +; LINUX-NEXT: .cfi_def_cfa_offset 32 +; LINUX-NEXT: popq %r14 +; LINUX-NEXT: .cfi_def_cfa_offset 24 +; LINUX-NEXT: popq %r15 +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: popq %rbp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB0_4: +; LINUX-NEXT: .cfi_def_cfa_offset 288 +; LINUX-NEXT: addq $232, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 56 ; LINUX-NEXT: popq %rbx ; LINUX-NEXT: .cfi_def_cfa_offset 48 @@ -109,6 +126,85 @@ ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-OPT0-LABEL: f_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 336 +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: je .LBB0_2 +; LINUX-OPT0-NEXT: # %bb.1: +; LINUX-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, 
{{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB0_2: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdi, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r8b # 1-byte Reload +; LINUX-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; LINUX-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; LINUX-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movl $48, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movl $8, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, %rdi +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: callq get_f +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r10b # 1-byte Reload +; LINUX-OPT0-NEXT: movq %rax, (%rsp) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, %al +; LINUX-OPT0-NEXT: movq (%rsp), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB0_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB0_4: +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 336 +; LINUX-OPT0-NEXT: addq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: f_thunk: ; LINUX-X32: # %bb.0: ; LINUX-X32-NEXT: pushq %rbp @@ -123,8 +219,8 @@ ; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 ; LINUX-X32-NEXT: pushq %rbx ; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 -; LINUX-X32-NEXT: subl $344, %esp # imm = 0x158 -; LINUX-X32-NEXT: .cfi_def_cfa_offset 400 +; LINUX-X32-NEXT: subl $216, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 272 ; LINUX-X32-NEXT: .cfi_offset %rbx, -56 ; LINUX-X32-NEXT: .cfi_offset %r12, -48 ; LINUX-X32-NEXT: .cfi_offset %r13, 
-40 @@ -137,6 +233,11 @@ ; LINUX-X32-NEXT: movq %rdx, %rbp ; LINUX-X32-NEXT: movq %rsi, %rbx ; LINUX-X32-NEXT: movl %edi, %r14d +; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rcx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r8, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r9, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; LINUX-X32-NEXT: testb %al, %al ; LINUX-X32-NEXT: je .LBB0_2 @@ -150,11 +251,6 @@ ; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: .LBB0_2: -; LINUX-X32-NEXT: movq %rbx, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %rbp, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r13, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r12, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r15, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax ; LINUX-X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax @@ -162,14 +258,6 @@ ; LINUX-X32-NEXT: movabsq $206158430216, %rax # imm = 0x3000000008 ; LINUX-X32-NEXT: movq %rax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movl %r14d, %edi -; LINUX-X32-NEXT: movaps %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: callq get_f ; LINUX-X32-NEXT: movl %eax, %r11d ; LINUX-X32-NEXT: movl %r14d, %edi @@ -178,16 +266,36 @@ ; LINUX-X32-NEXT: movq %r13, %rcx ; LINUX-X32-NEXT: movq %r12, %r8 ; LINUX-X32-NEXT: movq %r15, %r9 -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload ; LINUX-X32-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload -; LINUX-X32-NEXT: addl $344, %esp # imm = 0x158 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB0_4 +; LINUX-X32-NEXT: # %bb.3: +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-NEXT: addl $216, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 +; LINUX-X32-NEXT: popq %rbx +; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 +; LINUX-X32-NEXT: popq %r12 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 40 +; LINUX-X32-NEXT: popq %r13 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 32 +; LINUX-X32-NEXT: popq %r14 +; LINUX-X32-NEXT: 
.cfi_def_cfa_offset 24 +; LINUX-X32-NEXT: popq %r15 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: popq %rbp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB0_4: +; LINUX-X32-NEXT: .cfi_def_cfa_offset 272 +; LINUX-X32-NEXT: addl $216, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 ; LINUX-X32-NEXT: popq %rbx ; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 @@ -203,6 +311,87 @@ ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-X32-OPT0-LABEL: f_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 320 +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB0_2 +; LINUX-X32-OPT0-NEXT: # %bb.1: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB0_2: +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rax # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rax, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdi, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %r8b # 1-byte Reload +; LINUX-X32-OPT0-NEXT: leal {{[0-9]+}}(%rsp), %r9d +; LINUX-X32-OPT0-NEXT: movl %r9d, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: leal {{[0-9]+}}(%rsp), %r9d +; LINUX-X32-OPT0-NEXT: movl %r9d, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl $48, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl $8, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %r9d # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movl %r9d, %edi +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r8b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: callq get_f +; LINUX-X32-OPT0-NEXT: movl %eax, %eax +; LINUX-X32-OPT0-NEXT: movl %eax, %ecx +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%edi # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r10 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rcx, (%esp) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r10, %rcx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq (%esp), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB0_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB0_4: +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 320 +; LINUX-X32-OPT0-NEXT: addl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; WINDOWS-LABEL: f_thunk: ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: pushq %r14 @@ -240,6 +429,31 @@ ; WINDOWS-NEXT: .text ; WINDOWS-NEXT: .seh_endproc ; +; WINDOWS-OPT0-LABEL: f_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq $104, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 104 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; WINDOWS-OPT0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: callq get_f +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $104, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%rax # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; ; X86-NOSSE-LABEL: f_thunk: ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp @@ -259,6 +473,25 @@ ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: jmpl *%eax # TAILCALL ; +; X86-NOSSE-OPT0-LABEL: f_thunk: +; X86-NOSSE-OPT0: # %bb.0: +; X86-NOSSE-OPT0-NEXT: pushl %ebp +; X86-NOSSE-OPT0-NEXT: movl %esp, %ebp +; X86-NOSSE-OPT0-NEXT: andl $-16, %esp +; X86-NOSSE-OPT0-NEXT: subl $48, %esp +; X86-NOSSE-OPT0-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-OPT0-NEXT: leal 12(%ebp), %ecx +; X86-NOSSE-OPT0-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: movl %esp, %ecx +; X86-NOSSE-OPT0-NEXT: movl %eax, (%ecx) +; 
X86-NOSSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOSSE-OPT0-NEXT: calll _get_f +; X86-NOSSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl %ecx, 8(%ebp) +; X86-NOSSE-OPT0-NEXT: movl %ebp, %esp +; X86-NOSSE-OPT0-NEXT: popl %ebp +; X86-NOSSE-OPT0-NEXT: jmpl *%eax # TAILCALL +; ; X86-SSE-LABEL: f_thunk: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %ebp @@ -283,6 +516,31 @@ ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: jmpl *%eax # TAILCALL +; +; X86-SSE-OPT0-LABEL: f_thunk: +; X86-SSE-OPT0: # %bb.0: +; X86-SSE-OPT0-NEXT: pushl %ebp +; X86-SSE-OPT0-NEXT: movl %esp, %ebp +; X86-SSE-OPT0-NEXT: andl $-16, %esp +; X86-SSE-OPT0-NEXT: subl $112, %esp +; X86-SSE-OPT0-NEXT: movl 8(%ebp), %eax +; X86-SSE-OPT0-NEXT: leal 12(%ebp), %ecx +; X86-SSE-OPT0-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE-OPT0-NEXT: movl %esp, %ecx +; X86-SSE-OPT0-NEXT: movl %eax, (%ecx) +; X86-SSE-OPT0-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE-OPT0-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: calll _get_f +; X86-SSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE-OPT0-NEXT: movl %ecx, 8(%ebp) +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movl %ebp, %esp +; X86-SSE-OPT0-NEXT: popl %ebp +; X86-SSE-OPT0-NEXT: jmpl *%eax # TAILCALL %ap = alloca [4 x i8*], align 16 %ap_i8 = bitcast [4 x i8*]* %ap to i8* call void @llvm.va_start(i8* %ap_i8) @@ -296,23 +554,209 @@ ; No regparms on normal x86 conventions. -; This thunk shouldn't require any spills and reloads, assuming the register -; allocator knows what it's doing. +; This thunk stores xmms on entry and restores them before jumping. +; Storing and restoring xmms could be optimized out for this concrete case. define void @g_thunk(i8* %fptr_i8, ...) 
{ ; LINUX-LABEL: g_thunk: ; LINUX: # %bb.0: +; LINUX-NEXT: pushq %rax +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB1_2 +; LINUX-NEXT: # %bb.1: +; LINUX-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm5, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm6, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm7, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: .LBB1_2: +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB1_4 +; LINUX-NEXT: # %bb.3: +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; LINUX-NEXT: popq %r11 +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%rdi # TAILCALL +; LINUX-NEXT: .LBB1_4: +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: popq %r11 +; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%rdi # TAILCALL ; +; LINUX-OPT0-LABEL: g_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $72, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 80 +; LINUX-OPT0-NEXT: movb %al, %r10b +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: je .LBB1_2 +; LINUX-OPT0-NEXT: # %bb.1: +; LINUX-OPT0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, (%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB1_2: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB1_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps (%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps 
-{{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $72, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB1_4: +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 80 +; LINUX-OPT0-NEXT: addq $72, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: g_thunk: ; LINUX-X32: # %bb.0: +; LINUX-X32-NEXT: pushq %rax +; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB1_2 +; LINUX-X32-NEXT: # %bb.1: +; LINUX-X32-NEXT: movaps %xmm0, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm1, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm2, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm3, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm4, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm5, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm6, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm7, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: .LBB1_2: ; LINUX-X32-NEXT: movl %edi, %r11d +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB1_4 +; LINUX-X32-NEXT: # %bb.3: +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB1_4: +; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-X32-OPT0-LABEL: g_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $72, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 80 +; LINUX-X32-OPT0-NEXT: movb %al, %r10b +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB1_2 +; LINUX-X32-OPT0-NEXT: # %bb.1: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, -{{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, -{{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, -{{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, -{{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, (%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB1_2: +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl %eax, %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; 
LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB1_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps (%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps -{{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps -{{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps -{{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps -{{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $72, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB1_4: +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 80 +; LINUX-X32-OPT0-NEXT: addl $72, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; WINDOWS-LABEL: g_thunk: ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: rex64 jmpq *%rcx # TAILCALL ; +; WINDOWS-OPT0-LABEL: g_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: pushq %rax +; WINDOWS-OPT0-NEXT: .seh_stackalloc 8 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: movq %rcx, (%rsp) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq (%rsp), %rax # 8-byte Reload +; WINDOWS-OPT0-NEXT: popq %r10 +; WINDOWS-OPT0-NEXT: rex64 jmpq *%rax # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; ; X86-LABEL: g_thunk: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -332,28 +776,322 @@ define void @h_thunk(%struct.Foo* %this, ...) 
{ ; LINUX-LABEL: h_thunk: ; LINUX: # %bb.0: +; LINUX-NEXT: pushq %rax +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_2 +; LINUX-NEXT: # %bb.1: +; LINUX-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm5, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm6, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm7, -{{[0-9]+}}(%rsp) +; LINUX-NEXT: .LBB2_2: ; LINUX-NEXT: cmpb $1, (%rdi) -; LINUX-NEXT: jne .LBB2_2 -; LINUX-NEXT: # %bb.1: # %then +; LINUX-NEXT: jne .LBB2_4 +; LINUX-NEXT: # %bb.3: # %then ; LINUX-NEXT: movq 8(%rdi), %r11 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_6 +; LINUX-NEXT: # %bb.5: # %then +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; LINUX-NEXT: addq $8, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL -; LINUX-NEXT: .LBB2_2: # %else +; LINUX-NEXT: .LBB2_4: # %else +; LINUX-NEXT: .cfi_def_cfa_offset 16 ; LINUX-NEXT: movq 16(%rdi), %r11 ; LINUX-NEXT: movl $42, {{.*}}(%rip) +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_8 +; LINUX-NEXT: # %bb.7: # %else +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; LINUX-NEXT: addq $8, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB2_6: # %then +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: addq $8, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB2_8: # %else +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: addq $8, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-OPT0-LABEL: h_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $88, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 96 +; LINUX-OPT0-NEXT: movb %al, %r10b +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: je .LBB2_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, (%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: 
movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB2_4: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: testb $1, (%rax) +; LINUX-OPT0-NEXT: jne .LBB2_1 +; LINUX-OPT0-NEXT: jmp .LBB2_2 +; LINUX-OPT0-NEXT: .LBB2_1: # %then +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq 8(%rax), %rcx +; LINUX-OPT0-NEXT: movq %rax, %rdi +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, %rcx +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB2_6 +; LINUX-OPT0-NEXT: # %bb.5: # %then +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps (%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $88, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_6: # %then +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 96 +; LINUX-OPT0-NEXT: addq $88, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_2: # %else +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 96 +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq 16(%rax), %rcx +; LINUX-OPT0-NEXT: movl $42, {{.*}}(%rip) +; LINUX-OPT0-NEXT: movq %rax, %rdi +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, %rcx +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB2_8 +; LINUX-OPT0-NEXT: # %bb.7: # %else +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps (%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $88, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_8: # %else +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 96 +; LINUX-OPT0-NEXT: addq $88, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; 
LINUX-X32-LABEL: h_thunk: ; LINUX-X32: # %bb.0: +; LINUX-X32-NEXT: pushq %rax +; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_2 +; LINUX-X32-NEXT: # %bb.1: +; LINUX-X32-NEXT: movaps %xmm0, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm1, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm2, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm3, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm4, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm5, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm6, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm7, -{{[0-9]+}}(%esp) +; LINUX-X32-NEXT: .LBB2_2: ; LINUX-X32-NEXT: cmpb $1, (%edi) -; LINUX-X32-NEXT: jne .LBB2_2 -; LINUX-X32-NEXT: # %bb.1: # %then +; LINUX-X32-NEXT: jne .LBB2_4 +; LINUX-X32-NEXT: # %bb.3: # %then ; LINUX-X32-NEXT: movl 4(%edi), %r11d +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_6 +; LINUX-X32-NEXT: # %bb.5: # %then +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL -; LINUX-X32-NEXT: .LBB2_2: # %else +; LINUX-X32-NEXT: .LBB2_4: # %else +; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 ; LINUX-X32-NEXT: movl 8(%edi), %r11d ; LINUX-X32-NEXT: movl $42, {{.*}}(%rip) +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_8 +; LINUX-X32-NEXT: # %bb.7: # %else +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps -{{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB2_6: # %then +; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB2_8: # %else +; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-X32-OPT0-LABEL: h_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $88, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 96 +; LINUX-X32-OPT0-NEXT: movb %al, %r10b +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB2_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, -{{[0-9]+}}(%esp) 
+; LINUX-X32-OPT0-NEXT: movaps %xmm1, -{{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, -{{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, (%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB2_4: +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: testb $1, (%eax) +; LINUX-X32-OPT0-NEXT: jne .LBB2_1 +; LINUX-X32-OPT0-NEXT: jmp .LBB2_2 +; LINUX-X32-OPT0-NEXT: .LBB2_1: # %then +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl 4(%eax), %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB2_6 +; LINUX-X32-OPT0-NEXT: # %bb.5: # %then +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps (%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps -{{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps -{{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps -{{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $88, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_6: # %then +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 96 +; LINUX-X32-OPT0-NEXT: addl $88, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_2: # %else +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 96 +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl 8(%eax), %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl $42, {{.*}}(%rip) +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB2_8 +; LINUX-X32-OPT0-NEXT: # %bb.7: # %else +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: 
movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps (%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps -{{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps -{{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps -{{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $88, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_8: # %else +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 96 +; LINUX-X32-OPT0-NEXT: addl $88, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; WINDOWS-LABEL: h_thunk: ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: cmpb $1, (%rcx) @@ -366,20 +1104,127 @@ ; WINDOWS-NEXT: movl $42, {{.*}}(%rip) ; WINDOWS-NEXT: rex64 jmpq *%rax # TAILCALL ; -; X86-LABEL: h_thunk: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb $1, (%eax) -; X86-NEXT: jne LBB2_2 -; X86-NEXT: # %bb.1: # %then -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: jmpl *%ecx # TAILCALL -; X86-NEXT: LBB2_2: # %else -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl $42, _g -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: jmpl *%ecx # TAILCALL +; WINDOWS-OPT0-LABEL: h_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq $48, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 48 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: testb $1, (%rcx) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: jne .LBB2_1 +; WINDOWS-OPT0-NEXT: jmp .LBB2_2 +; WINDOWS-OPT0-NEXT: .LBB2_1: # %then +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq 8(%rax), %rcx +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rax, %rcx +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $48, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .LBB2_2: # %else +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq 16(%rax), %rcx +; WINDOWS-OPT0-NEXT: movl $42, {{.*}}(%rip) +; WINDOWS-OPT0-NEXT: movq %rcx, (%rsp) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rax, %rcx +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq (%rsp), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $48, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; +; X86-NOSSE-LABEL: h_thunk: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: cmpb $1, (%eax) +; X86-NOSSE-NEXT: jne LBB2_2 +; X86-NOSSE-NEXT: # %bb.1: # %then +; X86-NOSSE-NEXT: movl 4(%eax), %ecx +; 
X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: jmpl *%ecx # TAILCALL +; X86-NOSSE-NEXT: LBB2_2: # %else +; X86-NOSSE-NEXT: movl 8(%eax), %ecx +; X86-NOSSE-NEXT: movl $42, _g +; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: jmpl *%ecx # TAILCALL +; +; X86-NOSSE-OPT0-LABEL: h_thunk: +; X86-NOSSE-OPT0: # %bb.0: +; X86-NOSSE-OPT0-NEXT: pushl %eax +; X86-NOSSE-OPT0-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-OPT0-NEXT: testb $1, (%eax) +; X86-NOSSE-OPT0-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NOSSE-OPT0-NEXT: jne LBB2_1 +; X86-NOSSE-OPT0-NEXT: jmp LBB2_2 +; X86-NOSSE-OPT0-NEXT: LBB2_1: # %then +; X86-NOSSE-OPT0-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl 4(%eax), %ecx +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: popl %eax +; X86-NOSSE-OPT0-NEXT: jmpl *%ecx # TAILCALL +; X86-NOSSE-OPT0-NEXT: LBB2_2: # %else +; X86-NOSSE-OPT0-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl 8(%eax), %ecx +; X86-NOSSE-OPT0-NEXT: movl $42, _g +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: popl %eax +; X86-NOSSE-OPT0-NEXT: jmpl *%ecx # TAILCALL +; +; X86-SSE-LABEL: h_thunk: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cmpb $1, (%eax) +; X86-SSE-NEXT: jne LBB2_2 +; X86-SSE-NEXT: # %bb.1: # %then +; X86-SSE-NEXT: movl 4(%eax), %ecx +; X86-SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: jmpl *%ecx # TAILCALL +; X86-SSE-NEXT: LBB2_2: # %else +; X86-SSE-NEXT: movl 8(%eax), %ecx +; X86-SSE-NEXT: movl $42, _g +; X86-SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: jmpl *%ecx # TAILCALL +; +; X86-SSE-OPT0-LABEL: h_thunk: +; X86-SSE-OPT0: # %bb.0: +; X86-SSE-OPT0-NEXT: subl $76, %esp +; X86-SSE-OPT0-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-OPT0-NEXT: testb $1, (%eax) +; X86-SSE-OPT0-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE-OPT0-NEXT: movups %xmm2, (%esp) # 16-byte Spill +; X86-SSE-OPT0-NEXT: jne LBB2_1 +; X86-SSE-OPT0-NEXT: jmp LBB2_2 +; X86-SSE-OPT0-NEXT: LBB2_1: # %then +; X86-SSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE-OPT0-NEXT: movl 4(%eax), %ecx +; X86-SSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups (%esp), %xmm2 # 16-byte Reload +; X86-SSE-OPT0-NEXT: addl $76, %esp +; X86-SSE-OPT0-NEXT: jmpl *%ecx # TAILCALL +; X86-SSE-OPT0-NEXT: LBB2_2: # %else +; X86-SSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE-OPT0-NEXT: movl 8(%eax), %ecx +; X86-SSE-OPT0-NEXT: movl $42, _g +; X86-SSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups (%esp), %xmm2 # 16-byte Reload +; X86-SSE-OPT0-NEXT: addl $76, %esp +; X86-SSE-OPT0-NEXT: jmpl *%ecx # TAILCALL %cond_p = getelementptr %struct.Foo, %struct.Foo* %this, i32 0, i32 0 %cond = load i1, i1* %cond_p br i1 %cond, label %then, label %else diff --git a/llvm/test/CodeGen/X86/vastart-defs-eflags.ll b/llvm/test/CodeGen/X86/vastart-defs-eflags.ll --- a/llvm/test/CodeGen/X86/vastart-defs-eflags.ll +++ 
b/llvm/test/CodeGen/X86/vastart-defs-eflags.ll @@ -9,6 +9,11 @@ ; CHECK-LABEL: check_flag: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je LBB0_2 ; CHECK-NEXT: ## %bb.1: ## %entry @@ -21,11 +26,6 @@ ; CHECK-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; CHECK-NEXT: LBB0_2: ## %entry -; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl $512, %edi ## imm = 0x200 ; CHECK-NEXT: je LBB0_4 diff --git a/llvm/test/CodeGen/X86/x32-va_start.ll b/llvm/test/CodeGen/X86/x32-va_start.ll --- a/llvm/test/CodeGen/X86/x32-va_start.ll +++ b/llvm/test/CodeGen/X86/x32-va_start.ll @@ -27,6 +27,11 @@ call void @llvm.lifetime.start.p0i8(i64 16, i8* %0) #2 call void @llvm.va_start(i8* %0) ; SSE: subl $72, %esp +; CHECK-DAG: movq %r9 +; CHECK-DAG: movq %r8 +; CHECK-DAG: movq %rcx +; CHECK-DAG: movq %rdx +; CHECK-DAG: movq %rsi ; SSE: testb %al, %al ; SSE: je .[[NOFP:.*]] ; SSE-DAG: movaps %xmm1 @@ -38,11 +43,6 @@ ; SSE-DAG: movaps %xmm7 ; NOSSE-NOT: xmm ; SSE: .[[NOFP]]: -; CHECK-DAG: movq %r9 -; CHECK-DAG: movq %r8 -; CHECK-DAG: movq %rcx -; CHECK-DAG: movq %rdx -; CHECK-DAG: movq %rsi %gp_offset_p = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0, i32 0 %gp_offset = load i32, i32* %gp_offset_p, align 16 %fits_in_gp = icmp ult i32 %gp_offset, 41 diff --git a/llvm/test/CodeGen/X86/xmm-vararg-noopt.ll b/llvm/test/CodeGen/X86/xmm-vararg-noopt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/xmm-vararg-noopt.ll @@ -0,0 +1,49 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s + +; CHECK-LABEL: testvarargs +; Ensure that xmm registers are not used before testing %al +; CHECK-NOT: xmm +; CHECK: testb %al, %al +; CHECK-NOT: xmm +; CHECK: # %bb.1 +; CHECK-NEXT: %xmm0, {{.*}}%rsp +; CHECK-NEXT: %xmm1, {{.*}}%rsp +; CHECK-NEXT: %xmm2, {{.*}}%rsp +; CHECK-NEXT: %xmm3, {{.*}}%rsp +; CHECK-NEXT: %xmm4, {{.*}}%rsp +; CHECK-NEXT: %xmm5, {{.*}}%rsp +; CHECK-NEXT: %xmm6, {{.*}}%rsp +; CHECK-NEXT: %xmm7, {{.*}}%rsp + +; ModuleID = 'variadic.c' +source_filename = "variadic.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux" + +%struct.__va_list_tag = type { i32, i32, i8*, i8* } + +@.str = private unnamed_addr constant [9 x i8] c"\0A hello \00", align 1 + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @testvarargs(i8* %fmt, ...) { +entry: + %fmt.addr = alloca i8*, align 8 + %va = alloca [1 x %struct.__va_list_tag], align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %va, i64 0, i64 0 + %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8* + call void @llvm.va_start(i8* %arraydecay1) + %arraydecay2 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %va, i64 0, i64 0 + %arraydecay23 = bitcast %struct.__va_list_tag* %arraydecay2 to i8* + call void @llvm.va_end(i8* %arraydecay23) + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i64 0, i64 0)) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.va_start(i8*) + +; Function Attrs: nounwind +declare void @llvm.va_end(i8*) + +declare dso_local i32 @printf(i8*, ...)
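
The new xmm-vararg-noopt.ll test above only verifies that the XMM spill block is reached after the "testb %al, %al" guard. As a minimal, hypothetical C sketch (not part of the patch; the prototype matches the test's testvarargs, but the callers and the 1.0 argument are illustrative only), the following shows why that guard matters: under the SysV AMD64 calling convention the caller of a variadic function puts the number of vector-register arguments in %al, so a call site that passes no floating-point values lets the callee skip the guarded XMM saves entirely.

/* Hypothetical callers of the test's variadic function; only
 * testvarargs(i8*, ...) itself exists in the IR above. */
void testvarargs(const char *fmt, ...);

int main(void) {
  testvarargs("\n hello ");      /* no vector args: caller sets %al = 0,
                                    the guarded XMM spill block is skipped */
  testvarargs("\n hello ", 1.0); /* one double in %xmm0: caller sets %al = 1,
                                    the callee must spill the XMM registers */
  return 0;
}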