Index: llvm/include/llvm/CodeGen/CallingConvLower.h =================================================================== --- llvm/include/llvm/CodeGen/CallingConvLower.h +++ llvm/include/llvm/CodeGen/CallingConvLower.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_CALLINGCONVLOWER_H #define LLVM_CODEGEN_CALLINGCONVLOWER_H +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -163,10 +164,12 @@ }; /// Describes a register that needs to be forwarded from the prologue to a -/// musttail call. +/// musttail call. Specifying VReg == 0 means that the register should be +/// put into guarded area and no virtual register was created for it. struct ForwardedRegister { ForwardedRegister(unsigned VReg, MCPhysReg PReg, MVT VT) : VReg(VReg), PReg(PReg), VT(VT) {} + bool IsGuarded() const { return VReg == 0; } unsigned VReg; MCPhysReg PReg; MVT VT; @@ -525,8 +528,9 @@ /// Compute the set of registers that need to be preserved and forwarded to /// any musttail calls. void analyzeMustTailForwardedRegisters( - SmallVectorImpl &Forwards, ArrayRef RegParmTypes, - CCAssignFn Fn); + SmallVectorImpl &Forwards, + const SmallDenseSet &GuardedForwardedRegs, + ArrayRef RegParmTypes, CCAssignFn Fn); /// Returns true if the results of the two calling conventions are compatible. /// This is usually part of the check for tailcall eligibility. Index: llvm/include/llvm/CodeGen/MachineBasicBlock.h =================================================================== --- llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -110,6 +110,10 @@ /// Indicate that this basic block is entered via an exception handler. bool IsEHPad = false; + /// Indicate that this basic block used for saving vararg registers + /// and is entered from entry block. + bool IsGuardedRegsBlk = false; + /// Indicate that this basic block is potentially the target of an indirect /// branch. bool AddressTaken = false; @@ -378,6 +382,14 @@ /// Set alignment of the basic block. void setAlignment(Align A) { Alignment = A; } + /// Returns true if the block is used to save guarded varargs registers. + /// This basic block is entered from an entry block. + bool isGuardedRegsBlk() const { return IsGuardedRegsBlk; } + + /// Marks the block as one which is used to save guarded varargs registers. + /// This basic block is entered from an entry block. + void setIsGuardedRegsBlk(bool V = true) { IsGuardedRegsBlk = V; } + /// Returns true if the block is a landing pad. That is this basic block is /// entered via an exception handler. bool isEHPad() const { return IsEHPad; } Index: llvm/lib/CodeGen/CallingConvLower.cpp =================================================================== --- llvm/lib/CodeGen/CallingConvLower.cpp +++ llvm/lib/CodeGen/CallingConvLower.cpp @@ -236,8 +236,9 @@ } void CCState::analyzeMustTailForwardedRegisters( - SmallVectorImpl &Forwards, ArrayRef RegParmTypes, - CCAssignFn Fn) { + SmallVectorImpl &Forwards, + const SmallDenseSet &GuardedForwardedRegs, + ArrayRef RegParmTypes, CCAssignFn Fn) { // Oftentimes calling conventions will not user register parameters for // variadic functions, so we need to assume we're not variadic so that we get // all the registers that might be used in a non-variadic call. 
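The VReg == 0 convention introduced in ForwardedRegister above is the only thing that distinguishes a guarded forward from an ordinary one. The following standalone sketch is not code from the patch; MCPhysReg and MVT are stubbed with plain integers and the register numbers are made up, purely to illustrate how IsGuarded() partitions the forwarded registers.

#include <cstdio>
#include <vector>

// Mirrors the ForwardedRegister convention: VReg == 0 means "guarded",
// i.e. no virtual register was created and the value must be reloaded
// from the guarded save area instead of being copied through a vreg.
struct ForwardedRegisterSketch {
  unsigned VReg; // 0 => guarded
  unsigned PReg; // stand-in for MCPhysReg
  int VT;        // stand-in for MVT
  bool IsGuarded() const { return VReg == 0; }
};

int main() {
  std::vector<ForwardedRegisterSketch> Forwards = {
      {7, 40, 1}, // ordinary forward: copied through a virtual register
      {0, 17, 2}, // guarded forward: e.g. an XMM reg behind the %al check
  };
  for (const auto &F : Forwards)
    std::printf("preg %u: %s\n", F.PReg,
                F.IsGuarded() ? "reload from guarded save area"
                              : "forward through virtual register");
  return 0;
}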
@@ -250,8 +251,11 @@ const TargetLowering *TL = MF.getSubtarget().getTargetLowering(); const TargetRegisterClass *RC = TL->getRegClassFor(RegVT); for (MCPhysReg PReg : RemainingRegs) { - unsigned VReg = MF.addLiveIn(PReg, RC); - Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT)); + if (GuardedForwardedRegs.count(PReg) == 0) { + unsigned VReg = MF.addLiveIn(PReg, RC); + Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT)); + } else + Forwards.push_back(ForwardedRegister(0, PReg, RegVT)); } } } Index: llvm/lib/CodeGen/MachineVerifier.cpp =================================================================== --- llvm/lib/CodeGen/MachineVerifier.cpp +++ llvm/lib/CodeGen/MachineVerifier.cpp @@ -621,9 +621,11 @@ if (!MF->getProperties().hasProperty( MachineFunctionProperties::Property::NoPHIs) && MRI->tracksLiveness()) { // If this block has allocatable physical registers live-in, check that - // it is an entry block or landing pad. + // it is an entry block or landing pad or varargs guarded registers + // saving block. for (const auto &LI : MBB->liveins()) { if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() && + !MBB->isGuardedRegsBlk() && MBB->getIterator() != MBB->getParent()->begin()) { report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB); report_context(LI.PhysReg); Index: llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -209,6 +209,13 @@ // If we have a musttail call in a variadic function, we need to ensure we // forward implicit register parameters. if (const auto *CI = dyn_cast(&I)) { + // check for llvm::Intrinsic::icall_branch_funnel intrinsic. + // we do not store varargs parameters explicitly for icall_branch_funnel + if (CI->getCalledFunction() && + CI->getCalledFunction()->getIntrinsicID() == + llvm::Intrinsic::icall_branch_funnel) + continue; + if (CI->isMustTailCall() && Fn->isVarArg()) MF->getFrameInfo().setHasMustTailInVarArgFunc(true); } Index: llvm/lib/Target/AArch64/AArch64CallLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ llvm/lib/Target/AArch64/AArch64CallLowering.cpp @@ -397,7 +397,16 @@ // Later on, we can use this vector to restore the registers if necessary. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn); + + // TODO: At x86 platform, XMM varargs parameters should be + // TODO: guarded with check for %al register to avoid using xmm + // TODO: registers(if they were not actually specified). + // TODO: Define set of guarded registers here if the same is neccessary + // TODO: for AArch64 (https://bugs.llvm.org/show_bug.cgi?id=42219). + // TODO: Otherwise remove this comment. + SmallDenseSet guardedRegs; + CCInfo.analyzeMustTailForwardedRegisters(Forwards, guardedRegs, RegParmTypes, + AssignFn); // Conservatively forward X8, since it might be used for an aggregate // return. Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3322,8 +3322,16 @@ // Compute the set of forwarded registers. The rest are scratch. 
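As a cross-check on the guarded-set parameter added to analyzeMustTailForwardedRegisters above, here is a standalone sketch of the same per-register decision: registers named in the guarded set are recorded with VReg == 0 and are not added as live-ins, while the rest keep the old addLiveIn-plus-virtual-register path. The types and the vreg counter are stubs, not LLVM API.

#include <cstdint>
#include <set>
#include <vector>

struct Fwd {
  unsigned VReg; // 0 => guarded
  uint16_t PReg;
  int VT;
};

// Stub of the selection performed in analyzeMustTailForwardedRegisters.
std::vector<Fwd> collectForwards(const std::vector<uint16_t> &RemainingRegs,
                                 const std::set<uint16_t> &GuardedRegs,
                                 int RegVT) {
  std::vector<Fwd> Forwards;
  unsigned NextVReg = 1; // stand-in for MF.addLiveIn(PReg, RC)
  for (uint16_t PReg : RemainingRegs) {
    if (GuardedRegs.count(PReg) == 0)
      Forwards.push_back({NextVReg++, PReg, RegVT}); // normal live-in + vreg
    else
      Forwards.push_back({0, PReg, RegVT}); // guarded: no vreg, no live-in
  }
  return Forwards;
}

int main() {
  std::set<uint16_t> Guarded = {17, 18};          // e.g. two XMM registers
  std::vector<uint16_t> Remaining = {40, 17, 41}; // mixed GPRs and XMMs
  auto Forwards = collectForwards(Remaining, Guarded, /*RegVT=*/1);
  return Forwards.size() == 3 ? 0 : 1;
}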
   SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms();
-  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
-                                           CC_AArch64_AAPCS);
+
+  // TODO: On x86, XMM varargs parameters are guarded with a check of the
+  // TODO: %al register to avoid touching XMM registers that were not
+  // TODO: actually specified. Define the set of guarded registers here if
+  // TODO: the same is necessary for AArch64
+  // TODO: (https://bugs.llvm.org/show_bug.cgi?id=42219).
+  // TODO: Otherwise remove this comment.
+  SmallDenseSet guardedRegs;
+  CCInfo.analyzeMustTailForwardedRegisters(Forwards, guardedRegs,
+                                           RegParmTypes, CC_AArch64_AAPCS);

   // Conservatively forward X8, since it might be used for aggregate return.
   if (!CCInfo.isAllocated(AArch64::X8)) {
Index: llvm/lib/Target/X86/X86ExpandPseudo.cpp
===================================================================
--- llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -64,6 +64,9 @@
   bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
   bool ExpandMBB(MachineBasicBlock &MBB);
+
+  void CreateTailCallBlocksPair(MachineBasicBlock &OriginalTailCallBlk,
+                                MachineBasicBlock::iterator &TCPseudoInstr);
 };
 char X86ExpandPseudo::ID = 0;
@@ -173,6 +176,209 @@
   JTMBB->erase(JTInst);
 }
+// This function replaces the original tail call instruction with two versions
+// of the tail call. One is identical to the original; the other has XMM
+// register restoring code inserted in front of it. Additionally, a branch that
+// checks %al is emitted to select the proper version of the tail call.
+//
+// f_thunk:                   f_thunk:
+// # %bb.1:             =>    # %bb.1:
+//   addq 32, %rsp              testb %al, %al
+//   jmpq tc_func               je .LBB0_2
+//                            # %bb.2:
+//                              movaps 96(%rsp), %xmm0
+//                              addq 32, %rsp
+//                              jmpq tc_func
+//                            .LBB0_2:
+//                            # %bb.3:
+//                              addq 32, %rsp
+//                              jmpq tc_func
+//
+void X86ExpandPseudo::CreateTailCallBlocksPair(
+    MachineBasicBlock &OriginalTailCallBlk,
+    MachineBasicBlock::iterator &TCPseudoInstr) {
+
+  MachineFunction *Func = OriginalTailCallBlk.getParent();
+  X86MachineFunctionInfo *X86Info = Func->getInfo();
+  const auto &Forwards = X86Info->getForwardedMustTailRegParms();
+
+  // Enumerate the forwarded registers and check whether any of them is
+  // guarded.
+  bool hasGuardedArgs = false;
+  for (auto &F : Forwards)
+    if (F.IsGuarded()) {
+      hasGuardedArgs = true;
+      break;
+    }
+
+  // Do nothing if there are no guarded registers.
+  if (!hasGuardedArgs)
+    return;
+
+  const BasicBlock *LLVM_BB = OriginalTailCallBlk.getBasicBlock();
+
+  MachineBasicBlock::iterator TailCallMInstr = std::prev(TCPseudoInstr);
+  DebugLoc DL = TCPseudoInstr->getDebugLoc();
+
+  // Create two blocks for the tail calls.
+ MachineFunction::iterator MBBIter = ++OriginalTailCallBlk.getIterator(); + MachineBasicBlock *TailCallBlkWithGuardedRegs = + Func->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *TailCallBlk = Func->CreateMachineBasicBlock(LLVM_BB); + Func->insert(MBBIter, TailCallBlkWithGuardedRegs); + Func->insert(MBBIter, TailCallBlk); + + TailCallBlk->transferSuccessors(&OriginalTailCallBlk); + OriginalTailCallBlk.addSuccessor(TailCallBlkWithGuardedRegs); + OriginalTailCallBlk.addSuccessor(TailCallBlk); + + // search for the start of stack restoring code + MachineInstr *FirstInstructionOfStackRestoringCode = &*TailCallMInstr; + + for (MachineBasicBlock::reverse_iterator CurStackRestoreInstr = + TailCallMInstr.getReverse(); + CurStackRestoreInstr != OriginalTailCallBlk.rend(); + ++CurStackRestoreInstr) { + + // skip tail call instruction + if (CurStackRestoreInstr->getOpcode() == TailCallMInstr->getOpcode()) + continue; + + // skip CFI instructions + if (CurStackRestoreInstr->isCFIInstruction()) + continue; + + if ((CurStackRestoreInstr->getOpcode() == X86::SUB64ri32 || + CurStackRestoreInstr->getOpcode() == X86::SUB64ri8 || + CurStackRestoreInstr->getOpcode() == X86::SUB32ri || + CurStackRestoreInstr->getOpcode() == X86::SUB32ri8) && + CurStackRestoreInstr->getOperand(0).getReg() == + TRI->getStackRegister()) { + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } else if ((CurStackRestoreInstr->getOpcode() == X86::ADD64ri32 || + CurStackRestoreInstr->getOpcode() == X86::ADD64ri8 || + CurStackRestoreInstr->getOpcode() == X86::ADD32ri || + CurStackRestoreInstr->getOpcode() == X86::ADD32ri8) && + CurStackRestoreInstr->getOperand(0).getReg() == + TRI->getStackRegister()) { + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } else if (CurStackRestoreInstr->getOpcode() == X86::POP64r) { + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } else if ((CurStackRestoreInstr->getOpcode() == X86::LEA32r || + CurStackRestoreInstr->getOpcode() == X86::LEA64_32r) && + CurStackRestoreInstr->getOperand(0).getReg() == + TRI->getStackRegister() && + CurStackRestoreInstr->getOperand(1).getReg() == + TRI->getStackRegister() && + CurStackRestoreInstr->getOperand(2).getImm() == 1 && + CurStackRestoreInstr->getOperand(3).getReg() == + X86::NoRegister && + CurStackRestoreInstr->getOperand(5).getReg() == + X86::NoRegister) { + // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg. + FirstInstructionOfStackRestoringCode = &*CurStackRestoreInstr; + continue; + } + + break; + } + + // copy stack restoring code and tailcall instruction into + // two created blocks. Delete copied instructions from the + // OriginalTailCallBlk. 
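The backwards walk above classifies a small set of opcode patterns as "stack restoring" code (ADD/SUB on the stack pointer, POP64r, and SP-relative LEAs), skipping CFI directives along the way. The following standalone sketch of that classification uses a stub enum in place of real LLVM opcodes and is only meant to make the loop easier to follow; it is not the patch's code.

#include <cstdio>

enum class Op { AddRspImm, SubRspImm, Pop64r, LeaFromRsp, Cfi, TailCall, Other };

// Stub of the opcode classes the loop above treats as stack-restoring code.
static bool isStackRestoreLike(Op O) {
  return O == Op::AddRspImm || O == Op::SubRspImm || O == Op::Pop64r ||
         O == Op::LeaFromRsp;
}

// Walk backwards from the tail call and find the first instruction of the
// contiguous stack-restoring run, skipping CFI directives, in the same way
// as the reverse iteration above.
static int firstStackRestoreIndex(const Op *Block, int TailCallIdx) {
  int First = TailCallIdx;
  for (int I = TailCallIdx - 1; I >= 0; --I) {
    if (Block[I] == Op::Cfi)
      continue; // CFI instructions do not break the run
    if (!isStackRestoreLike(Block[I]))
      break;
    First = I;
  }
  return First;
}

int main() {
  // ... function body ..., addq $32, %rsp, cfi, jmpq tc_func
  Op Block[] = {Op::Other, Op::Other, Op::AddRspImm, Op::Cfi, Op::TailCall};
  std::printf("restore code starts at index %d\n",
              firstStackRestoreIndex(Block, 4)); // prints 2
}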
+ MachineBasicBlock::iterator curInstr = FirstInstructionOfStackRestoringCode; + + do { + // copy instructions into TailCallBlkWithGuardedRegs + MachineInstrBuilder MIB = BuildMI(TailCallBlkWithGuardedRegs, DL, + TII->get(curInstr->getOpcode())); + + for (auto MO : curInstr->operands()) + MIB->addOperand(*Func, MO); + + // copy instructions into TailCallBlk + MachineInstrBuilder SMIB = + BuildMI(TailCallBlk, DL, TII->get(curInstr->getOpcode())); + + for (auto MO : curInstr->operands()) + SMIB->addOperand(*Func, MO); + + // stop copying if we achieved tail call instruction + if (curInstr->getOpcode() == TailCallMInstr->getOpcode()) { + OriginalTailCallBlk.erase(curInstr); + break; + } + + curInstr = &*OriginalTailCallBlk.erase(curInstr); + } while (curInstr != OriginalTailCallBlk.end()); + + // copy call site information into new tail call instructions + OriginalTailCallBlk.getParent()->copyCallSiteInfo( + &*TCPseudoInstr, &*TailCallBlkWithGuardedRegs->getLastNonDebugInstr()); + + OriginalTailCallBlk.getParent()->copyCallSiteInfo( + &*TCPseudoInstr, &*TailCallBlk->getLastNonDebugInstr()); + + // If %al is 0, branch around the XMM save block. + BuildMI(&OriginalTailCallBlk, DL, TII->get(X86::TEST8rr)) + .addReg(X86::AL) + .addReg(X86::AL); + BuildMI(&OriginalTailCallBlk, DL, TII->get(X86::JCC_1)) + .addMBB(TailCallBlk) + .addImm(X86::COND_E); + + // add code restoring xmm regsiters into start of TailCallInstrFromGuardedBlk + MachineInstr &TailCallInstrFromGuardedBlk = + *TailCallBlkWithGuardedRegs->getLastNonDebugInstr(); + + // TODO: take into account YMM, ZMM here + unsigned MOVOpc = STI->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm; + + int RegIdx = 0; + for (const auto &Fwd : Forwards) { + if (Fwd.IsGuarded()) { + int64_t OffsetInsideSaveArea = + (Func->getFrameInfo().hasVAStart() ? X86Info->getVarArgsFPOffset() + : 0); + unsigned BaseReg; + int64_t Offset = + X86FL->getFrameIndexReference( + *Func, X86Info->getThunkRegSaveFrameIndex(), BaseReg) + + RegIdx * 16 + OffsetInsideSaveArea; + + MachineMemOperand *MMO = Func->getMachineMemOperand( + MachinePointerInfo::getFixedStack( + *Func, X86Info->getThunkRegSaveFrameIndex(), Offset), + MachineMemOperand::MOLoad, + /*Size=*/16, /*Align=*/16); + + BuildMI(*TailCallBlkWithGuardedRegs, TailCallBlkWithGuardedRegs->begin(), + DL, TII->get(MOVOpc), Fwd.PReg) + .addReg(BaseReg) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addMemOperand(MMO); + + TailCallInstrFromGuardedBlk.addOperand( + MachineOperand::CreateReg(Fwd.PReg, false /*IsDef*/, true /*IsImp*/)); + RegIdx++; + } + } + + // add liveins into newly created blocks + for (auto &MO : TCPseudoInstr->operands()) { + if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) { + TailCallBlk->addLiveIn(MO.getReg()); + TailCallBlkWithGuardedRegs->addLiveIn(MO.getReg()); + } + } +} + /// If \p MBBI is a pseudo instruction, this method expands /// it to the corresponding (sequence of) actual instruction(s). /// \returns true if \p MBBI has been expanded. @@ -275,7 +481,17 @@ MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); - MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); + MBB.getParent()->copyCallSiteInfo(&*MBBI, &NewMI); + MachineFunction *Func = MBB.getParent(); + + // check for case when variadic function is a thunk. + // We need to propagate parameters into final tailcall then. + // Passing xmm parameters a bit tricky in this case. 
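For readers less familiar with how X86 machine memory operands are spelled, the reload emitted above follows the usual base/scale/index/displacement/segment operand order, and each guarded XMM register occupies one 16-byte slot in the save area. A small arithmetic sketch of the displacement computation follows; the base offset here is made up, while in the code above it comes from getFrameIndexReference() on ThunkRegSaveFrameIndex plus VarArgsFPOffset when va_start is present.

#include <cstdio>

int main() {
  const long SaveAreaBase = -64; // hypothetical frame offset, for illustration
  const long XmmSlotSize = 16;   // one 128-bit XMM register per slot
  for (int RegIdx = 0; RegIdx < 3; ++RegIdx)
    std::printf("guarded xmm #%d -> movaps displacement %ld\n", RegIdx,
                SaveAreaBase + RegIdx * XmmSlotSize);
}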
+ // Xmm parameters should be guarded with the check for %al + // register. + if (!STI->isCallingConvWin64(Func->getFunction().getCallingConv()) && + STI->is64Bit() && Func->getFrameInfo().hasMustTailInVarArgFunc()) + CreateTailCallBlocksPair(MBB, MBBI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); Index: llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -24,652 +24,771 @@ namespace X86ISD { // X86 Specific DAG Nodes - enum NodeType : unsigned { - // Start the numbering where the builtin ops leave off. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - /// Bit scan forward. - BSF, - /// Bit scan reverse. - BSR, - - /// Double shift instructions. These correspond to - /// X86::SHLDxx and X86::SHRDxx instructions. - SHLD, - SHRD, - - /// Bitwise logical AND of floating point values. This corresponds - /// to X86::ANDPS or X86::ANDPD. - FAND, - - /// Bitwise logical OR of floating point values. This corresponds - /// to X86::ORPS or X86::ORPD. - FOR, - - /// Bitwise logical XOR of floating point values. This corresponds - /// to X86::XORPS or X86::XORPD. - FXOR, - - /// Bitwise logical ANDNOT of floating point values. This - /// corresponds to X86::ANDNPS or X86::ANDNPD. - FANDN, - - /// These operations represent an abstract X86 call - /// instruction, which includes a bunch of information. In particular the - /// operands of these node are: - /// - /// #0 - The incoming token chain - /// #1 - The callee - /// #2 - The number of arg bytes the caller pushes on the stack. - /// #3 - The number of arg bytes the callee pops off the stack. - /// #4 - The value to pass in AL/AX/EAX (optional) - /// #5 - The value to pass in DL/DX/EDX (optional) - /// - /// The result values of these nodes are: - /// - /// #0 - The outgoing token chain - /// #1 - The first register result value (optional) - /// #2 - The second register result value (optional) - /// - CALL, - - /// Same as call except it adds the NoTrack prefix. - NT_CALL, - - /// X86 compare and logical compare instructions. - CMP, COMI, UCOMI, - - /// X86 bit-test instructions. - BT, - - /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS - /// operand, usually produced by a CMP instruction. - SETCC, - - /// X86 Select - SELECTS, - - // Same as SETCC except it's materialized with a sbb and the value is all - // one's or all zero's. - SETCC_CARRY, // R = carry_bit ? ~0 : 0 - - /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. - /// Operands are two FP values to compare; result is a mask of - /// 0s or 1s. Generally DTRT for C/C++ with NaNs. - FSETCC, - - /// X86 FP SETCC, similar to above, but with output as an i1 mask and - /// and a version with SAE. - FSETCCM, FSETCCM_SAE, - - /// X86 conditional moves. Operand 0 and operand 1 are the two values - /// to select from. Operand 2 is the condition code, and operand 3 is the - /// flag operand produced by a CMP or TEST instruction. - CMOV, - - /// X86 conditional branches. Operand 0 is the chain operand, operand 1 - /// is the block to branch if condition is true, operand 2 is the - /// condition code, and operand 3 is the flag operand produced by a CMP - /// or TEST instruction. - BRCOND, - - /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and - /// operand 1 is the target address. - NT_BRIND, - - /// Return with a flag operand. 
Operand 0 is the chain operand, operand - /// 1 is the number of bytes of stack to pop. - RET_FLAG, - - /// Return from interrupt. Operand 0 is the number of bytes to pop. - IRET, - - /// Repeat fill, corresponds to X86::REP_STOSx. - REP_STOS, - - /// Repeat move, corresponds to X86::REP_MOVSx. - REP_MOVS, - - /// On Darwin, this node represents the result of the popl - /// at function entry, used for PIC code. - GlobalBaseReg, - - /// A wrapper node for TargetConstantPool, TargetJumpTable, - /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress, - /// MCSymbol and TargetBlockAddress. - Wrapper, - - /// Special wrapper used under X86-64 PIC mode for RIP - /// relative displacements. - WrapperRIP, - - /// Copies a 64-bit value from an MMX vector to the low word - /// of an XMM vector, with the high word zero filled. - MOVQ2DQ, - - /// Copies a 64-bit value from the low word of an XMM vector - /// to an MMX vector. - MOVDQ2Q, - - /// Copies a 32-bit value from the low word of a MMX - /// vector to a GPR. - MMX_MOVD2W, - - /// Copies a GPR into the low 32-bit word of a MMX vector - /// and zero out the high word. - MMX_MOVW2D, - - /// Extract an 8-bit value from a vector and zero extend it to - /// i32, corresponds to X86::PEXTRB. - PEXTRB, - - /// Extract a 16-bit value from a vector and zero extend it to - /// i32, corresponds to X86::PEXTRW. - PEXTRW, - - /// Insert any element of a 4 x float vector into any element - /// of a destination 4 x floatvector. - INSERTPS, - - /// Insert the lower 8-bits of a 32-bit value to a vector, - /// corresponds to X86::PINSRB. - PINSRB, - - /// Insert the lower 16-bits of a 32-bit value to a vector, - /// corresponds to X86::PINSRW. - PINSRW, - - /// Shuffle 16 8-bit values within a vector. - PSHUFB, - - /// Compute Sum of Absolute Differences. - PSADBW, - /// Compute Double Block Packed Sum-Absolute-Differences - DBPSADBW, - - /// Bitwise Logical AND NOT of Packed FP values. - ANDNP, - - /// Blend where the selector is an immediate. - BLENDI, - - /// Dynamic (non-constant condition) vector blend where only the sign bits - /// of the condition elements are used. This is used to enforce that the - /// condition mask is not valid for generic VSELECT optimizations. This - /// is also used to implement the intrinsics. - /// Operands are in VSELECT order: MASK, TRUE, FALSE - BLENDV, - - /// Combined add and sub on an FP vector. - ADDSUB, - - // FP vector ops with rounding mode. - FADD_RND, FADDS, FADDS_RND, - FSUB_RND, FSUBS, FSUBS_RND, - FMUL_RND, FMULS, FMULS_RND, - FDIV_RND, FDIVS, FDIVS_RND, - FMAX_SAE, FMAXS_SAE, - FMIN_SAE, FMINS_SAE, - FSQRT_RND, FSQRTS, FSQRTS_RND, - - // FP vector get exponent. - FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE, - // Extract Normalized Mantissas. - VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE, - // FP Scale. - SCALEF, SCALEF_RND, - SCALEFS, SCALEFS_RND, - - // Unsigned Integer average. - AVG, - - /// Integer horizontal add/sub. - HADD, - HSUB, - - /// Floating point horizontal add/sub. - FHADD, - FHSUB, - - // Detect Conflicts Within a Vector - CONFLICT, - - /// Floating point max and min. - FMAX, FMIN, - - /// Commutative FMIN and FMAX. - FMAXC, FMINC, - - /// Scalar intrinsic floating point max and min. - FMAXS, FMINS, - - /// Floating point reciprocal-sqrt and reciprocal approximation. - /// Note that these typically require refinement - /// in order to obtain suitable precision. - FRSQRT, FRCP, - - // AVX-512 reciprocal approximations with a little more precision. 
- RSQRT14, RSQRT14S, RCP14, RCP14S, - - // Thread Local Storage. - TLSADDR, + enum NodeType : unsigned { + // Start the numbering where the builtin ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + /// Bit scan forward. + BSF, + /// Bit scan reverse. + BSR, + + /// Double shift instructions. These correspond to + /// X86::SHLDxx and X86::SHRDxx instructions. + SHLD, + SHRD, + + /// Bitwise logical AND of floating point values. This corresponds + /// to X86::ANDPS or X86::ANDPD. + FAND, + + /// Bitwise logical OR of floating point values. This corresponds + /// to X86::ORPS or X86::ORPD. + FOR, + + /// Bitwise logical XOR of floating point values. This corresponds + /// to X86::XORPS or X86::XORPD. + FXOR, + + /// Bitwise logical ANDNOT of floating point values. This + /// corresponds to X86::ANDNPS or X86::ANDNPD. + FANDN, + + /// These operations represent an abstract X86 call + /// instruction, which includes a bunch of information. In particular the + /// operands of these node are: + /// + /// #0 - The incoming token chain + /// #1 - The callee + /// #2 - The number of arg bytes the caller pushes on the stack. + /// #3 - The number of arg bytes the callee pops off the stack. + /// #4 - The value to pass in AL/AX/EAX (optional) + /// #5 - The value to pass in DL/DX/EDX (optional) + /// + /// The result values of these nodes are: + /// + /// #0 - The outgoing token chain + /// #1 - The first register result value (optional) + /// #2 - The second register result value (optional) + /// + CALL, - // Thread Local Storage. A call to get the start address - // of the TLS block for the current module. - TLSBASEADDR, + /// Same as call except it adds the NoTrack prefix. + NT_CALL, - // Thread Local Storage. When calling to an OS provided - // thunk at the address from an earlier relocation. - TLSCALL, + /// X86 compare and logical compare instructions. + CMP, + COMI, + UCOMI, - // Exception Handling helpers. - EH_RETURN, + /// X86 bit-test instructions. + BT, - // SjLj exception handling setjmp. - EH_SJLJ_SETJMP, + /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS + /// operand, usually produced by a CMP instruction. + SETCC, - // SjLj exception handling longjmp. - EH_SJLJ_LONGJMP, + /// X86 Select + SELECTS, - // SjLj exception handling dispatch. - EH_SJLJ_SETUP_DISPATCH, + // Same as SETCC except it's materialized with a sbb and the value is all + // one's or all zero's. + SETCC_CARRY, // R = carry_bit ? ~0 : 0 - /// Tail call return. See X86TargetLowering::LowerCall for - /// the list of operands. - TC_RETURN, + /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. + /// Operands are two FP values to compare; result is a mask of + /// 0s or 1s. Generally DTRT for C/C++ with NaNs. + FSETCC, - // Vector move to low scalar and zero higher vector elements. - VZEXT_MOVL, + /// X86 FP SETCC, similar to above, but with output as an i1 mask and + /// and a version with SAE. + FSETCCM, + FSETCCM_SAE, - // Vector integer truncate. - VTRUNC, - // Vector integer truncate with unsigned/signed saturation. - VTRUNCUS, VTRUNCS, + /// X86 conditional moves. Operand 0 and operand 1 are the two values + /// to select from. Operand 2 is the condition code, and operand 3 is the + /// flag operand produced by a CMP or TEST instruction. + CMOV, - // Masked version of the above. Used when less than a 128-bit result is - // produced since the mask only applies to the lower elements and can't - // be represented by a select. 
- // SRC, PASSTHRU, MASK - VMTRUNC, VMTRUNCUS, VMTRUNCS, - - // Vector FP extend. - VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE, - - // Vector FP round. - VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND, - - // Masked version of above. Used for v2f64->v4f32. - // SRC, PASSTHRU, MASK - VMFPROUND, - - // 128-bit vector logical left / right shift - VSHLDQ, VSRLDQ, - - // Vector shift elements - VSHL, VSRL, VSRA, - - // Vector variable shift - VSHLV, VSRLV, VSRAV, - - // Vector shift elements by immediate - VSHLI, VSRLI, VSRAI, - - // Shifts of mask registers. - KSHIFTL, KSHIFTR, - - // Bit rotate by immediate - VROTLI, VROTRI, - - // Vector packed double/float comparison. - CMPP, - - // Vector integer comparisons. - PCMPEQ, PCMPGT, - - // v8i16 Horizontal minimum and position. - PHMINPOS, - - MULTISHIFT, - - /// Vector comparison generating mask bits for fp and - /// integer signed and unsigned data types. - CMPM, - // Vector comparison with SAE for FP values - CMPM_SAE, - - // Arithmetic operations with FLAGS results. - ADD, SUB, ADC, SBB, SMUL, UMUL, - OR, XOR, AND, - - // Bit field extract. - BEXTR, - - // Zero High Bits Starting with Specified Bit Position. - BZHI, - - // X86-specific multiply by immediate. - MUL_IMM, - - // Vector sign bit extraction. - MOVMSK, - - // Vector bitwise comparisons. - PTEST, - - // Vector packed fp sign bitwise comparisons. - TESTP, - - // OR/AND test for masks. - KORTEST, - KTEST, - - // ADD for masks. - KADD, - - // Several flavors of instructions with vector shuffle behaviors. - // Saturated signed/unnsigned packing. - PACKSS, - PACKUS, - // Intra-lane alignr. - PALIGNR, - // AVX512 inter-lane alignr. - VALIGN, - PSHUFD, - PSHUFHW, - PSHUFLW, - SHUFP, - // VBMI2 Concat & Shift. - VSHLD, - VSHRD, - VSHLDV, - VSHRDV, - //Shuffle Packed Values at 128-bit granularity. - SHUF128, - MOVDDUP, - MOVSHDUP, - MOVSLDUP, - MOVLHPS, - MOVHLPS, - MOVSD, - MOVSS, - UNPCKL, - UNPCKH, - VPERMILPV, - VPERMILPI, - VPERMI, - VPERM2X128, - - // Variable Permute (VPERM). - // Res = VPERMV MaskV, V0 - VPERMV, - - // 3-op Variable Permute (VPERMT2). - // Res = VPERMV3 V0, MaskV, V1 - VPERMV3, - - // Bitwise ternary logic. - VPTERNLOG, - // Fix Up Special Packed Float32/64 values. - VFIXUPIMM, VFIXUPIMM_SAE, - VFIXUPIMMS, VFIXUPIMMS_SAE, - // Range Restriction Calculation For Packed Pairs of Float32/64 values. - VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE, - // Reduce - Perform Reduction Transformation on scalar\packed FP. - VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE, - // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. - // Also used by the legacy (V)ROUND intrinsics where we mask out the - // scaling part of the immediate. - VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE, - // Tests Types Of a FP Values for packed types. - VFPCLASS, - // Tests Types Of a FP Values for scalar types. - VFPCLASSS, - - // Broadcast (splat) scalar or element 0 of a vector. If the operand is - // a vector, this node may change the vector length as part of the splat. - VBROADCAST, - // Broadcast mask to vector. - VBROADCASTM, - // Broadcast subvector to vector. - SUBV_BROADCAST, - - /// SSE4A Extraction and Insertion. - EXTRQI, INSERTQI, - - // XOP arithmetic/logical shifts. - VPSHA, VPSHL, - // XOP signed/unsigned integer comparisons. - VPCOM, VPCOMU, - // XOP packed permute bytes. - VPPERM, - // XOP two source permutation. - VPERMIL2, - - // Vector multiply packed unsigned doubleword integers. - PMULUDQ, - // Vector multiply packed signed doubleword integers. 
- PMULDQ, - // Vector Multiply Packed UnsignedIntegers with Round and Scale. - MULHRS, - - // Multiply and Add Packed Integers. - VPMADDUBSW, VPMADDWD, - - // AVX512IFMA multiply and add. - // NOTE: These are different than the instruction and perform - // op0 x op1 + op2. - VPMADD52L, VPMADD52H, - - // VNNI - VPDPBUSD, - VPDPBUSDS, - VPDPWSSD, - VPDPWSSDS, - - // FMA nodes. - // We use the target independent ISD::FMA for the non-inverted case. - FNMADD, - FMSUB, - FNMSUB, - FMADDSUB, - FMSUBADD, - - // FMA with rounding mode. - FMADD_RND, - FNMADD_RND, - FMSUB_RND, - FNMSUB_RND, - FMADDSUB_RND, - FMSUBADD_RND, - - // Compress and expand. - COMPRESS, - EXPAND, - - // Bits shuffle - VPSHUFBITQMB, - - // Convert Unsigned/Integer to Floating-Point Value with rounding mode. - SINT_TO_FP_RND, UINT_TO_FP_RND, - SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP, - SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND, - - // Vector float/double to signed/unsigned integer. - CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND, - // Scalar float/double to signed/unsigned integer. - CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND, - - // Vector float/double to signed/unsigned integer with truncation. - CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE, - // Scalar float/double to signed/unsigned integer with truncation. - CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE, - - // Vector signed/unsigned integer to float/double. - CVTSI2P, CVTUI2P, - - // Masked versions of above. Used for v2f64->v4f32. - // SRC, PASSTHRU, MASK - MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI, - MCVTSI2P, MCVTUI2P, - - // Vector float to bfloat16. - // Convert TWO packed single data to one packed BF16 data - CVTNE2PS2BF16, - // Convert packed single data to packed BF16 data - CVTNEPS2BF16, - // Masked version of above. - // SRC, PASSTHRU, MASK - MCVTNEPS2BF16, - - // Dot product of BF16 pairs to accumulated into - // packed single precision. - DPBF16PS, - - // Save xmm argument registers to the stack, according to %al. An operator - // is needed so that this can be expanded with control flow. - VASTART_SAVE_XMM_REGS, - - // Windows's _chkstk call to do stack probing. - WIN_ALLOCA, - - // For allocating variable amounts of stack space when using - // segmented stacks. Check if the current stacklet has enough space, and - // falls back to heap allocation if not. - SEG_ALLOCA, - - // Memory barriers. - MEMBARRIER, - MFENCE, - - // Store FP status word into i16 register. - FNSTSW16r, - - // Store contents of %ah into %eflags. - SAHF, - - // Get a random integer and indicate whether it is valid in CF. - RDRAND, - - // Get a NIST SP800-90B & C compliant random integer and - // indicate whether it is valid in CF. - RDSEED, - - // Protection keys - // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. - // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is - // value for ECX. - RDPKRU, WRPKRU, - - // SSE42 string comparisons. - // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG - // will emit one or two instructions based on which results are used. If - // flags and index/mask this allows us to use a single instruction since - // we won't have to pick and opcode for flags. Instead we can rely on the - // DAG to CSE everything and decide at isel. - PCMPISTR, - PCMPESTR, - - // Test if in transactional execution. - XTEST, - - // ERI instructions. - RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE, - RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE, - - // Conversions between float and half-float. 
- CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE, - - // Masked version of above. - // SRC, RND, PASSTHRU, MASK - MCVTPS2PH, - - // Galois Field Arithmetic Instructions - GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB, - - // LWP insert record. - LWPINS, - - // User level wait - UMWAIT, TPAUSE, - - // Enqueue Stores Instructions - ENQCMD, ENQCMDS, - - // For avx512-vp2intersect - VP2INTERSECT, - - // Compare and swap. - LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, - LCMPXCHG8_DAG, - LCMPXCHG16_DAG, - LCMPXCHG8_SAVE_EBX_DAG, - LCMPXCHG16_SAVE_RBX_DAG, - - /// LOCK-prefixed arithmetic read-modify-write instructions. - /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) - LADD, LSUB, LOR, LXOR, LAND, - - // Load, scalar_to_vector, and zero extend. - VZEXT_LOAD, - - // extract_vector_elt, store. - VEXTRACT_STORE, - - // scalar broadcast from memory - VBROADCAST_LOAD, - - // Store FP control world into i16 memory. - FNSTCW16m, - - /// This instruction implements FP_TO_SINT with the - /// integer destination in memory and a FP reg source. This corresponds - /// to the X86::FIST*m instructions and the rounding mode change stuff. It - /// has two inputs (token chain and address) and two outputs (int value - /// and token chain). Memory VT specifies the type to store to. - FP_TO_INT_IN_MEM, - - /// This instruction implements SINT_TO_FP with the - /// integer source in memory and FP reg result. This corresponds to the - /// X86::FILD*m instructions. It has two inputs (token chain and address) - /// and two outputs (FP value and token chain). FILD_FLAG also produces a - /// flag). The integer source type is specified by the memory VT. - FILD, - FILD_FLAG, - - /// This instruction implements a fp->int store from FP stack - /// slots. This corresponds to the fist instruction. It takes a - /// chain operand, value to store, address, and glue. The memory VT - /// specifies the type to store as. - FIST, - - /// This instruction implements an extending load to FP stack slots. - /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain - /// operand, and ptr to load from. The memory VT specifies the type to - /// load from. - FLD, - - /// This instruction implements a truncating store from FP stack - /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a - /// chain operand, value to store, address, and glue. The memory VT - /// specifies the type to store as. - FST, - - /// This instruction grabs the address of the next argument - /// from a va_list. (reads and modifies the va_list in memory) - VAARG_64, - - // Vector truncating store with unsigned/signed saturation - VTRUNCSTOREUS, VTRUNCSTORES, - // Vector truncating masked store with unsigned/signed saturation - VMTRUNCSTOREUS, VMTRUNCSTORES, - - // X86 specific gather and scatter - MGATHER, MSCATTER, - - // WARNING: Do not add anything in the end unless you want the node to - // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all - // opcodes will be thought as target memory ops! - }; + /// X86 conditional branches. Operand 0 is the chain operand, operand 1 + /// is the block to branch if condition is true, operand 2 is the + /// condition code, and operand 3 is the flag operand produced by a CMP + /// or TEST instruction. + BRCOND, + + /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and + /// operand 1 is the target address. + NT_BRIND, + + /// Return with a flag operand. Operand 0 is the chain operand, operand + /// 1 is the number of bytes of stack to pop. + RET_FLAG, + + /// Return from interrupt. 
Operand 0 is the number of bytes to pop. + IRET, + + /// Repeat fill, corresponds to X86::REP_STOSx. + REP_STOS, + + /// Repeat move, corresponds to X86::REP_MOVSx. + REP_MOVS, + + /// On Darwin, this node represents the result of the popl + /// at function entry, used for PIC code. + GlobalBaseReg, + + /// A wrapper node for TargetConstantPool, TargetJumpTable, + /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress, + /// MCSymbol and TargetBlockAddress. + Wrapper, + + /// Special wrapper used under X86-64 PIC mode for RIP + /// relative displacements. + WrapperRIP, + + /// Copies a 64-bit value from an MMX vector to the low word + /// of an XMM vector, with the high word zero filled. + MOVQ2DQ, + + /// Copies a 64-bit value from the low word of an XMM vector + /// to an MMX vector. + MOVDQ2Q, + + /// Copies a 32-bit value from the low word of a MMX + /// vector to a GPR. + MMX_MOVD2W, + + /// Copies a GPR into the low 32-bit word of a MMX vector + /// and zero out the high word. + MMX_MOVW2D, + + /// Extract an 8-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRB. + PEXTRB, + + /// Extract a 16-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRW. + PEXTRW, + + /// Insert any element of a 4 x float vector into any element + /// of a destination 4 x floatvector. + INSERTPS, + + /// Insert the lower 8-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRB. + PINSRB, + + /// Insert the lower 16-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRW. + PINSRW, + + /// Shuffle 16 8-bit values within a vector. + PSHUFB, + + /// Compute Sum of Absolute Differences. + PSADBW, + /// Compute Double Block Packed Sum-Absolute-Differences + DBPSADBW, + + /// Bitwise Logical AND NOT of Packed FP values. + ANDNP, + + /// Blend where the selector is an immediate. + BLENDI, + + /// Dynamic (non-constant condition) vector blend where only the sign bits + /// of the condition elements are used. This is used to enforce that the + /// condition mask is not valid for generic VSELECT optimizations. This + /// is also used to implement the intrinsics. + /// Operands are in VSELECT order: MASK, TRUE, FALSE + BLENDV, + + /// Combined add and sub on an FP vector. + ADDSUB, + + // FP vector ops with rounding mode. + FADD_RND, + FADDS, + FADDS_RND, + FSUB_RND, + FSUBS, + FSUBS_RND, + FMUL_RND, + FMULS, + FMULS_RND, + FDIV_RND, + FDIVS, + FDIVS_RND, + FMAX_SAE, + FMAXS_SAE, + FMIN_SAE, + FMINS_SAE, + FSQRT_RND, + FSQRTS, + FSQRTS_RND, + + // FP vector get exponent. + FGETEXP, + FGETEXP_SAE, + FGETEXPS, + FGETEXPS_SAE, + // Extract Normalized Mantissas. + VGETMANT, + VGETMANT_SAE, + VGETMANTS, + VGETMANTS_SAE, + // FP Scale. + SCALEF, + SCALEF_RND, + SCALEFS, + SCALEFS_RND, + + // Unsigned Integer average. + AVG, + + /// Integer horizontal add/sub. + HADD, + HSUB, + + /// Floating point horizontal add/sub. + FHADD, + FHSUB, + + // Detect Conflicts Within a Vector + CONFLICT, + + /// Floating point max and min. + FMAX, + FMIN, + + /// Commutative FMIN and FMAX. + FMAXC, + FMINC, + + /// Scalar intrinsic floating point max and min. + FMAXS, + FMINS, + + /// Floating point reciprocal-sqrt and reciprocal approximation. + /// Note that these typically require refinement + /// in order to obtain suitable precision. + FRSQRT, + FRCP, + + // AVX-512 reciprocal approximations with a little more precision. + RSQRT14, + RSQRT14S, + RCP14, + RCP14S, + + // Thread Local Storage. + TLSADDR, + + // Thread Local Storage. 
A call to get the start address + // of the TLS block for the current module. + TLSBASEADDR, + + // Thread Local Storage. When calling to an OS provided + // thunk at the address from an earlier relocation. + TLSCALL, + + // Exception Handling helpers. + EH_RETURN, + + // SjLj exception handling setjmp. + EH_SJLJ_SETJMP, + + // SjLj exception handling longjmp. + EH_SJLJ_LONGJMP, + + // SjLj exception handling dispatch. + EH_SJLJ_SETUP_DISPATCH, + + /// Tail call return. See X86TargetLowering::LowerCall for + /// the list of operands. + TC_RETURN, + + // Vector move to low scalar and zero higher vector elements. + VZEXT_MOVL, + + // Vector integer truncate. + VTRUNC, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, + VTRUNCS, + + // Masked version of the above. Used when less than a 128-bit result is + // produced since the mask only applies to the lower elements and can't + // be represented by a select. + // SRC, PASSTHRU, MASK + VMTRUNC, + VMTRUNCUS, + VMTRUNCS, + + // Vector FP extend. + VFPEXT, + VFPEXT_SAE, + VFPEXTS, + VFPEXTS_SAE, + + // Vector FP round. + VFPROUND, + VFPROUND_RND, + VFPROUNDS, + VFPROUNDS_RND, + + // Masked version of above. Used for v2f64->v4f32. + // SRC, PASSTHRU, MASK + VMFPROUND, + + // 128-bit vector logical left / right shift + VSHLDQ, + VSRLDQ, + + // Vector shift elements + VSHL, + VSRL, + VSRA, + + // Vector variable shift + VSHLV, + VSRLV, + VSRAV, + + // Vector shift elements by immediate + VSHLI, + VSRLI, + VSRAI, + + // Shifts of mask registers. + KSHIFTL, + KSHIFTR, + + // Bit rotate by immediate + VROTLI, + VROTRI, + + // Vector packed double/float comparison. + CMPP, + + // Vector integer comparisons. + PCMPEQ, + PCMPGT, + + // v8i16 Horizontal minimum and position. + PHMINPOS, + + MULTISHIFT, + + /// Vector comparison generating mask bits for fp and + /// integer signed and unsigned data types. + CMPM, + // Vector comparison with SAE for FP values + CMPM_SAE, + + // Arithmetic operations with FLAGS results. + ADD, + SUB, + ADC, + SBB, + SMUL, + UMUL, + OR, + XOR, + AND, + + // Bit field extract. + BEXTR, + + // Zero High Bits Starting with Specified Bit Position. + BZHI, + + // X86-specific multiply by immediate. + MUL_IMM, + + // Vector sign bit extraction. + MOVMSK, + + // Vector bitwise comparisons. + PTEST, + + // Vector packed fp sign bitwise comparisons. + TESTP, + + // OR/AND test for masks. + KORTEST, + KTEST, + + // ADD for masks. + KADD, + + // Several flavors of instructions with vector shuffle behaviors. + // Saturated signed/unnsigned packing. + PACKSS, + PACKUS, + // Intra-lane alignr. + PALIGNR, + // AVX512 inter-lane alignr. + VALIGN, + PSHUFD, + PSHUFHW, + PSHUFLW, + SHUFP, + // VBMI2 Concat & Shift. + VSHLD, + VSHRD, + VSHLDV, + VSHRDV, + // Shuffle Packed Values at 128-bit granularity. + SHUF128, + MOVDDUP, + MOVSHDUP, + MOVSLDUP, + MOVLHPS, + MOVHLPS, + MOVSD, + MOVSS, + UNPCKL, + UNPCKH, + VPERMILPV, + VPERMILPI, + VPERMI, + VPERM2X128, + + // Variable Permute (VPERM). + // Res = VPERMV MaskV, V0 + VPERMV, + + // 3-op Variable Permute (VPERMT2). + // Res = VPERMV3 V0, MaskV, V1 + VPERMV3, + + // Bitwise ternary logic. + VPTERNLOG, + // Fix Up Special Packed Float32/64 values. + VFIXUPIMM, + VFIXUPIMM_SAE, + VFIXUPIMMS, + VFIXUPIMMS_SAE, + // Range Restriction Calculation For Packed Pairs of Float32/64 values. + VRANGE, + VRANGE_SAE, + VRANGES, + VRANGES_SAE, + // Reduce - Perform Reduction Transformation on scalar\packed FP. 
+ VREDUCE, + VREDUCE_SAE, + VREDUCES, + VREDUCES_SAE, + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. + // Also used by the legacy (V)ROUND intrinsics where we mask out the + // scaling part of the immediate. + VRNDSCALE, + VRNDSCALE_SAE, + VRNDSCALES, + VRNDSCALES_SAE, + // Tests Types Of a FP Values for packed types. + VFPCLASS, + // Tests Types Of a FP Values for scalar types. + VFPCLASSS, + + // Broadcast (splat) scalar or element 0 of a vector. If the operand is + // a vector, this node may change the vector length as part of the splat. + VBROADCAST, + // Broadcast mask to vector. + VBROADCASTM, + // Broadcast subvector to vector. + SUBV_BROADCAST, + + /// SSE4A Extraction and Insertion. + EXTRQI, + INSERTQI, + + // XOP arithmetic/logical shifts. + VPSHA, + VPSHL, + // XOP signed/unsigned integer comparisons. + VPCOM, + VPCOMU, + // XOP packed permute bytes. + VPPERM, + // XOP two source permutation. + VPERMIL2, + + // Vector multiply packed unsigned doubleword integers. + PMULUDQ, + // Vector multiply packed signed doubleword integers. + PMULDQ, + // Vector Multiply Packed UnsignedIntegers with Round and Scale. + MULHRS, + + // Multiply and Add Packed Integers. + VPMADDUBSW, + VPMADDWD, + + // AVX512IFMA multiply and add. + // NOTE: These are different than the instruction and perform + // op0 x op1 + op2. + VPMADD52L, + VPMADD52H, + + // VNNI + VPDPBUSD, + VPDPBUSDS, + VPDPWSSD, + VPDPWSSDS, + + // FMA nodes. + // We use the target independent ISD::FMA for the non-inverted case. + FNMADD, + FMSUB, + FNMSUB, + FMADDSUB, + FMSUBADD, + + // FMA with rounding mode. + FMADD_RND, + FNMADD_RND, + FMSUB_RND, + FNMSUB_RND, + FMADDSUB_RND, + FMSUBADD_RND, + + // Compress and expand. + COMPRESS, + EXPAND, + + // Bits shuffle + VPSHUFBITQMB, + + // Convert Unsigned/Integer to Floating-Point Value with rounding mode. + SINT_TO_FP_RND, + UINT_TO_FP_RND, + SCALAR_SINT_TO_FP, + SCALAR_UINT_TO_FP, + SCALAR_SINT_TO_FP_RND, + SCALAR_UINT_TO_FP_RND, + + // Vector float/double to signed/unsigned integer. + CVTP2SI, + CVTP2UI, + CVTP2SI_RND, + CVTP2UI_RND, + // Scalar float/double to signed/unsigned integer. + CVTS2SI, + CVTS2UI, + CVTS2SI_RND, + CVTS2UI_RND, + + // Vector float/double to signed/unsigned integer with truncation. + CVTTP2SI, + CVTTP2UI, + CVTTP2SI_SAE, + CVTTP2UI_SAE, + // Scalar float/double to signed/unsigned integer with truncation. + CVTTS2SI, + CVTTS2UI, + CVTTS2SI_SAE, + CVTTS2UI_SAE, + + // Vector signed/unsigned integer to float/double. + CVTSI2P, + CVTUI2P, + + // Masked versions of above. Used for v2f64->v4f32. + // SRC, PASSTHRU, MASK + MCVTP2SI, + MCVTP2UI, + MCVTTP2SI, + MCVTTP2UI, + MCVTSI2P, + MCVTUI2P, + + // Vector float to bfloat16. + // Convert TWO packed single data to one packed BF16 data + CVTNE2PS2BF16, + // Convert packed single data to packed BF16 data + CVTNEPS2BF16, + // Masked version of above. + // SRC, PASSTHRU, MASK + MCVTNEPS2BF16, + + // Dot product of BF16 pairs to accumulated into + // packed single precision. + DPBF16PS, + + // Save xmm argument registers to the stack, according to %al. An operator + // is needed so that this can be expanded with control flow. + VASTART_SAVE_XMM_REGS, + + // Save xmm argument registers of the vararg thunk function to the stack, + // according to %al. An operator is needed so that this can be expanded with + // control flow. + VARARG_THUNK_SAVE_XMM_REGS, + + // Windows's _chkstk call to do stack probing. 
+ WIN_ALLOCA, + + // For allocating variable amounts of stack space when using + // segmented stacks. Check if the current stacklet has enough space, and + // falls back to heap allocation if not. + SEG_ALLOCA, + + // Memory barriers. + MEMBARRIER, + MFENCE, + + // Store FP status word into i16 register. + FNSTSW16r, + + // Store contents of %ah into %eflags. + SAHF, + + // Get a random integer and indicate whether it is valid in CF. + RDRAND, + + // Get a NIST SP800-90B & C compliant random integer and + // indicate whether it is valid in CF. + RDSEED, + + // Protection keys + // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. + // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is + // value for ECX. + RDPKRU, + WRPKRU, + + // SSE42 string comparisons. + // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG + // will emit one or two instructions based on which results are used. If + // flags and index/mask this allows us to use a single instruction since + // we won't have to pick and opcode for flags. Instead we can rely on the + // DAG to CSE everything and decide at isel. + PCMPISTR, + PCMPESTR, + + // Test if in transactional execution. + XTEST, + + // ERI instructions. + RSQRT28, + RSQRT28_SAE, + RSQRT28S, + RSQRT28S_SAE, + RCP28, + RCP28_SAE, + RCP28S, + RCP28S_SAE, + EXP2, + EXP2_SAE, + + // Conversions between float and half-float. + CVTPS2PH, + CVTPH2PS, + CVTPH2PS_SAE, + + // Masked version of above. + // SRC, RND, PASSTHRU, MASK + MCVTPS2PH, + + // Galois Field Arithmetic Instructions + GF2P8AFFINEINVQB, + GF2P8AFFINEQB, + GF2P8MULB, + + // LWP insert record. + LWPINS, + + // User level wait + UMWAIT, + TPAUSE, + + // Enqueue Stores Instructions + ENQCMD, + ENQCMDS, + + // For avx512-vp2intersect + VP2INTERSECT, + + // Compare and swap. + LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, + LCMPXCHG8_DAG, + LCMPXCHG16_DAG, + LCMPXCHG8_SAVE_EBX_DAG, + LCMPXCHG16_SAVE_RBX_DAG, + + /// LOCK-prefixed arithmetic read-modify-write instructions. + /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) + LADD, + LSUB, + LOR, + LXOR, + LAND, + + // Load, scalar_to_vector, and zero extend. + VZEXT_LOAD, + + // extract_vector_elt, store. + VEXTRACT_STORE, + + // scalar broadcast from memory + VBROADCAST_LOAD, + + // Store FP control world into i16 memory. + FNSTCW16m, + + /// This instruction implements FP_TO_SINT with the + /// integer destination in memory and a FP reg source. This corresponds + /// to the X86::FIST*m instructions and the rounding mode change stuff. It + /// has two inputs (token chain and address) and two outputs (int value + /// and token chain). Memory VT specifies the type to store to. + FP_TO_INT_IN_MEM, + + /// This instruction implements SINT_TO_FP with the + /// integer source in memory and FP reg result. This corresponds to the + /// X86::FILD*m instructions. It has two inputs (token chain and address) + /// and two outputs (FP value and token chain). FILD_FLAG also produces a + /// flag). The integer source type is specified by the memory VT. + FILD, + FILD_FLAG, + + /// This instruction implements a fp->int store from FP stack + /// slots. This corresponds to the fist instruction. It takes a + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. + FIST, + + /// This instruction implements an extending load to FP stack slots. + /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain + /// operand, and ptr to load from. 
The memory VT specifies the type to + /// load from. + FLD, + + /// This instruction implements a truncating store from FP stack + /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. + FST, + + /// This instruction grabs the address of the next argument + /// from a va_list. (reads and modifies the va_list in memory) + VAARG_64, + + // Vector truncating store with unsigned/signed saturation + VTRUNCSTOREUS, + VTRUNCSTORES, + // Vector truncating masked store with unsigned/signed saturation + VMTRUNCSTOREUS, + VMTRUNCSTORES, + + // X86 specific gather and scatter + MGATHER, + MSCATTER, + + // WARNING: Do not add anything in the end unless you want the node to + // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all + // opcodes will be thought as target memory ops! + }; } // end namespace X86ISD /// Define some predicates that are used for node matching. @@ -1431,6 +1550,10 @@ EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, MachineBasicBlock *BB) const; + MachineBasicBlock * + EmitVarargThunkSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1, MachineInstr &MI2, MachineBasicBlock *BB) const; Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3354,9 +3354,14 @@ F.hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); + SmallDenseSet guardedXmmRegs; + SmallVector LiveGPRs; + SmallVector LiveXMMRegs; + SDValue ALVal; + // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. - if (Is64Bit && isVarArg && MFI.hasVAStart()) { + if (Is64Bit && isVarArg) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); @@ -3366,77 +3371,83 @@ "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. - SmallVector LiveGPRs; - SmallVector LiveXMMRegs; - SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } + if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { - unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); - LiveXMMRegs.push_back( - DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); - } - } - - if (IsWin64) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex( - MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - // Fixup to set vararg frame on shadow area (4 x i64). - if (NumIntRegs < 4) - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); - } else { - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so - // they may be loaded by dereferencing the result of va_next. 
- FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( - ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); - } - - // Store the integer parameter registers. - SmallVector MemOps; - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy(DAG.getDataLayout())); - unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (SDValue Val : LiveGPRs) { - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - RSFIN, DAG.getIntPtrConstant(Offset, dl)); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), - FuncInfo->getRegSaveFrameIndex(), Offset)); - MemOps.push_back(Store); - Offset += 8; - } - - if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { - // Now store the XMM (fp + vector) parameter registers. - SmallVector SaveXMMOps; - SaveXMMOps.push_back(Chain); - SaveXMMOps.push_back(ALVal); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getRegSaveFrameIndex(), dl)); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getVarArgsFPOffset(), dl)); - SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), - LiveXMMRegs.end()); - MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, SaveXMMOps)); + // FastRegisterAllocator spills virtual registers at basic + // block boundary. That leads to usages of xmm registers + // outside of check for %al. Pass physical registers to + // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling. + // See https://bugs.llvm.org/show_bug.cgi?id=42219. + MF.getRegInfo().addLiveIn(Reg); + LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32)); + guardedXmmRegs.insert(Reg); + } + } + + if (MFI.hasVAStart()) { + if (IsWin64) { + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( + MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by dereferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); + } + + // Store the integer parameter registers. + SmallVector MemOps; + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy(DAG.getDataLayout())); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = + DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, + DAG.getIntPtrConstant(Offset, dl)); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset)); + MemOps.push_back(Store); + Offset += 8; + } + + if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { + // Now store the XMM (fp + vector) parameter registers. 
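The VarArgsGPOffset / VarArgsFPOffset values computed above follow the SysV x86-64 register save area layout: six 8-byte GPR slots followed by eight 16-byte XMM slots (176 bytes in total), with the two offsets recording how much of each region the named arguments already consumed. A standalone sketch of that arithmetic, with example argument counts, is shown below; it is illustrative only.

#include <cstdio>

int main() {
  const unsigned NumArgGPRs = 6, NumArgXMMs = 8;   // SysV x86-64 argument regs
  unsigned NumIntRegsUsed = 2, NumXMMRegsUsed = 1; // example named arguments
  unsigned GPOffset = NumIntRegsUsed * 8;
  unsigned FPOffset = NumArgGPRs * 8 + NumXMMRegsUsed * 16;
  unsigned SaveAreaSize = NumArgGPRs * 8 + NumArgXMMs * 16;
  std::printf("gp_offset=%u fp_offset=%u save_area=%u bytes\n", GPOffset,
              FPOffset, SaveAreaSize); // gp_offset=16 fp_offset=64 save_area=176
}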
+ SmallVector SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), dl)); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), dl)); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI.hasMustTailInVarArgFunc()) { @@ -3462,7 +3473,8 @@ // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, guardedXmmRegs, + RegParmTypes, CC_X86); // Conservatively forward AL on x86_64, since it might be used for varargs. if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { @@ -3473,9 +3485,48 @@ // Copy all forwards from physical to virtual registers. for (ForwardedRegister &FR : Forwards) { // FIXME: Can we use a less constrained schedule? - SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); - FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); - Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); + if (!FR.IsGuarded()) { + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); + FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); + Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); + } + } + + if (guardedXmmRegs.size() > 0) { + if (MFI.hasVAStart()) {
+ // All incoming XMM registers are already stored by the VASTART + // handling. Reuse these stored values for the thunk's forwarded + // parameters here. + FuncInfo->setThunkRegSaveFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // TODO: Add a check for when the guarded vararg parameters need not be + // TODO: stored: if the function contains only musttail calls, if it does + // TODO: not use floating-point types, or if Attribute::NoImplicitFloat is + // TODO: specified, then it is possible to not store/restore the thunk's + // TODO: guarded vararg parameters. + + // TODO: Implement support for YMM and ZMM vararg registers. + + // Allocate stack space to save guardedXmmRegs; 16 is the size of an XMM + // register. + FuncInfo->setThunkRegSaveFrameIndex( + MFI.CreateStackObject(guardedXmmRegs.size() * 16, 16, false)); + + // Save the guarded forwards into the guarded area. + SmallVector VarargMemOps; + SmallVector VarargXMMOps; + VarargXMMOps.push_back(Chain); + VarargXMMOps.push_back(ALVal); + VarargXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getThunkRegSaveFrameIndex(), dl)); + VarargXMMOps.push_back(DAG.getIntPtrConstant(0, dl)); + VarargXMMOps.insert(VarargXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + VarargMemOps.push_back(DAG.getNode(X86ISD::VARARG_THUNK_SAVE_XMM_REGS, + dl, MVT::Other, VarargXMMOps)); + if (!VarargMemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VarargMemOps); + } } }
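Note (illustrative aside, not part of the patch): the loop above keeps the existing copy-through-virtual-register path for ordinary forwards and skips registers recorded with the VReg == 0 sentinel, which instead travel through the guarded save area. A rough standalone C++ sketch of that partition, with std:: containers standing in for SmallDenseSet/SmallVectorImpl and all names hypothetical:

  #include <cstdint>
  #include <unordered_set>
  #include <vector>

  using PhysReg = std::uint16_t;

  struct Forward {
    unsigned VReg; // 0 means "guarded": no virtual register was created
    PhysReg PReg;
    bool isGuarded() const { return VReg == 0; }
  };

  // Guarded registers get the VReg == 0 sentinel; the rest get a fresh
  // virtual register, roughly mirroring analyzeMustTailForwardedRegisters.
  std::vector<Forward> forwardRegs(const std::vector<PhysReg> &Remaining,
                                   const std::unordered_set<PhysReg> &Guarded,
                                   unsigned &NextVReg) {
    std::vector<Forward> Forwards;
    for (PhysReg P : Remaining)
      Forwards.push_back(Guarded.count(P) ? Forward{0, P}
                                          : Forward{NextVReg++, P});
    return Forwards;
  }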
@@ -3497,8 +3548,9 @@ } if (!Is64Bit) { - // RegSaveFrameIndex is X86-64 only. + // RegSaveFrameIndex and ThunkRegSaveFrameIndex are X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + FuncInfo->setThunkRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. @@ -3904,8 +3956,10 @@ if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { - SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + if (!F.IsGuarded()) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + } } } @@ -28745,6 +28799,8 @@ case X86ISD::PSADBW: return "X86ISD::PSADBW"; case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; + case X86ISD::VARARG_THUNK_SAVE_XMM_REGS: + return "X86ISD::VARARG_THUNK_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; @@ -29509,8 +29565,68 @@ return endMBB; }
+// This function creates an additional block for storing the guarded varargs +// registers. It adds a check for %al to the entry block so that +// GuardedRegsBlk is skipped if the XMM registers should not be stored. +// +// EntryBlk[VAPseudoInstr] EntryBlk +// | | . +// | | . +// | | GuardedRegsBlk +// | => | . +// | | . +// | TailBlk[VAPseudoInstr] +// | | +// | | +// +static std::pair +CreateGuardedRegsBlock(MachineBasicBlock *EntryBlk, MachineInstr &VAPseudoInstr, + const X86Subtarget &Subtarget) { + + MachineFunction *Func = EntryBlk->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = VAPseudoInstr.getDebugLoc(); + Register CountReg = VAPseudoInstr.getOperand(0).getReg(); + + // Create the new basic blocks. One block contains all the XMM stores, + // and one block is the final destination regardless of whether any + // stores were performed. + const BasicBlock *LLVMBlk = EntryBlk->getBasicBlock(); + MachineFunction::iterator EntryBlkIter = ++EntryBlk->getIterator(); + MachineBasicBlock *GuardedRegsBlk = Func->CreateMachineBasicBlock(LLVMBlk); + MachineBasicBlock *TailBlk = Func->CreateMachineBasicBlock(LLVMBlk); + Func->insert(EntryBlkIter, GuardedRegsBlk); + Func->insert(EntryBlkIter, TailBlk); + + GuardedRegsBlk->setIsGuardedRegsBlk(); + + // Transfer the remainder of EntryBlk and its successor edges to TailBlk. + TailBlk->splice(TailBlk->begin(), EntryBlk, + std::next(MachineBasicBlock::iterator(VAPseudoInstr)), + EntryBlk->end()); + TailBlk->transferSuccessorsAndUpdatePHIs(EntryBlk); + + // The entry block will now fall through to GuardedRegsBlk. + EntryBlk->addSuccessor(GuardedRegsBlk); + // GuardedRegsBlk will fall through to TailBlk. + GuardedRegsBlk->addSuccessor(TailBlk); + + if (!Subtarget.isCallingConvWin64(Func->getFunction().getCallingConv())) { + // If %al is 0, branch around GuardedRegsBlk. + BuildMI(EntryBlk, DL, TII->get(X86::TEST8rr)) + .addReg(CountReg) + .addReg(CountReg); + BuildMI(EntryBlk, DL, TII->get(X86::JCC_1)) + .addMBB(TailBlk) + .addImm(X86::COND_E); + EntryBlk->addSuccessor(TailBlk); + } + + return std::make_pair(GuardedRegsBlk, TailBlk); +} + MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( - MachineInstr &MI, MachineBasicBlock *MBB) const { + MachineInstr &PseudoVaStartInstr, MachineBasicBlock *EntryBlk) const { // Emit code to save XMM registers to the stack.
The ABI says that the // number of registers to save is given in %al, so it's theoretically // possible to do an indirect jump trick to avoid saving all of them, @@ -29519,69 +29635,133 @@ // easier on the hardware branch predictor, and stores aren't all that // expensive anyway. - // Create the new basic blocks. One block contains all the XMM stores, - // and one block is the final destination regardless of whether any - // stores were performed. - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction *F = MBB->getParent(); - MachineFunction::iterator MBBIter = ++MBB->getIterator(); - MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, XMMSaveMBB); - F->insert(MBBIter, EndMBB); + MachineBasicBlock *GuardedRegsBlk = nullptr; + MachineBasicBlock *TailBlk = nullptr; - // Transfer the remainder of MBB and its successor edges to EndMBB. - EndMBB->splice(EndMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - EndMBB->transferSuccessorsAndUpdatePHIs(MBB); + std::tie(GuardedRegsBlk, TailBlk) = + CreateGuardedRegsBlock(EntryBlk, PseudoVaStartInstr, Subtarget); - // The original block will now fall through to the XMM save block. - MBB->addSuccessor(XMMSaveMBB); - // The XMMSaveMBB will fall through to the end block. - XMMSaveMBB->addSuccessor(EndMBB); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = PseudoVaStartInstr.getDebugLoc(); + int64_t RegSaveFrameIndex = PseudoVaStartInstr.getOperand(1).getImm(); + int64_t VarArgsFPOffset = PseudoVaStartInstr.getOperand(2).getImm(); + MachineFunction *Func = EntryBlk->getParent(); // Now add the instructions. - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); - Register CountReg = MI.getOperand(0).getReg(); - int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); - int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); + // Make sure the last operand is EFLAGS, which gets clobbered by the branch + // that was just emitted, but clearly shouldn't be "saved". + assert( + (PseudoVaStartInstr.getNumOperands() <= 3 || + !PseudoVaStartInstr.getOperand(PseudoVaStartInstr.getNumOperands() - 1) + .isReg() || + PseudoVaStartInstr.getOperand(PseudoVaStartInstr.getNumOperands() - 1) + .getReg() == X86::EFLAGS) && + "Expected last argument to be EFLAGS"); + + // TODO: add support for YMM and ZMM here. + unsigned MovOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + // save all guarded XMM registers. + for (unsigned OpndIdx = 3, RegIdx = 0; + OpndIdx + 1 < PseudoVaStartInstr.getNumOperands(); OpndIdx++, RegIdx++) { + int64_t offset = RegIdx * 16 + VarArgsFPOffset; + MachineMemOperand *memoryOpnd = Func->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*Func, RegSaveFrameIndex, offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + BuildMI(GuardedRegsBlk, DL, TII->get(MovOpc)) + .addFrameIndex(RegSaveFrameIndex) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/offset) + .addReg(/*Segment=*/0) + .addReg(PseudoVaStartInstr.getOperand(OpndIdx).getReg()) + .addMemOperand(memoryOpnd); + assert(Register::isPhysicalRegister( + PseudoVaStartInstr.getOperand(OpndIdx).getReg())); + GuardedRegsBlk->addLiveIn(PseudoVaStartInstr.getOperand(OpndIdx).getReg()); + } + + PseudoVaStartInstr.eraseFromParent(); // The pseudo instruction is gone now. 
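Note (illustrative aside, not part of the patch): for readers who prefer a source-level model of what the code above emits, the sketch below mimics the resulting control flow with hypothetical names; it stands in for the emitted MachineIR (TEST8rr/JCC_1 in the entry block, one MOVAPSmr per register in GuardedRegsBlk), not for any LLVM API. The %al test is omitted for Win64 calling conventions, exactly as in CreateGuardedRegsBlock.

  #include <cstring>

  struct XmmSaveArea {
    alignas(16) unsigned char Slot[8][16]; // one 16-byte slot per XMM register
  };

  void guardedSaveSketch(unsigned char AL, const unsigned char (*Xmm)[16],
                         XmmSaveArea &Area) {
    if (AL != 0) {                // EntryBlk: testb %al, %al; je TailBlk
      for (int I = 0; I < 8; ++I) // GuardedRegsBlk: one aligned store per register
        std::memcpy(Area.Slot[I], Xmm[I], 16);
    }
    // TailBlk: the rest of the function. Before a musttail jump the same
    // %al test guards the reloads, as the updated CHECK lines below show.
  }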
- if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { - // If %al is 0, branch around the XMM save block. - BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E); - MBB->addSuccessor(EndMBB); + return TailBlk; +} + +MachineBasicBlock * +X86TargetLowering::EmitVarargThunkSaveXMMRegsWithCustomInserter( + MachineInstr &PseudoVarargThunkInstr, MachineBasicBlock *EntryBlk) const { + MachineBasicBlock *GuardedRegsBlk = nullptr; + MachineBasicBlock *TailBlk = nullptr; + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = PseudoVarargThunkInstr.getDebugLoc(); + int64_t ThunkRegSaveFrameIndex = + PseudoVarargThunkInstr.getOperand(1).getImm(); + int64_t VarArgsRegsOffset = PseudoVarargThunkInstr.getOperand(2).getImm(); + MachineFunction *Func = EntryBlk->getParent(); + bool NeedToAddLiveInsIntoGuardedRegsBlk = true; + + // check whether GuardedRegsBlk is already created by VASTART handling code + assert(Func->begin() != Func->end()); + for (auto &Succ : (*Func->begin()).successors()) { + if (Succ->isGuardedRegsBlk()) { + GuardedRegsBlk = Succ; + TailBlk = *GuardedRegsBlk->succ_begin(); + NeedToAddLiveInsIntoGuardedRegsBlk = false; + break; + } } + if (GuardedRegsBlk == nullptr) + std::tie(GuardedRegsBlk, TailBlk) = + CreateGuardedRegsBlock(EntryBlk, PseudoVarargThunkInstr, Subtarget); + + // Now add the instructions. + // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". - assert((MI.getNumOperands() <= 3 || - !MI.getOperand(MI.getNumOperands() - 1).isReg() || - MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) && + assert((PseudoVarargThunkInstr.getNumOperands() <= 3 || + !PseudoVarargThunkInstr + .getOperand(PseudoVarargThunkInstr.getNumOperands() - 1) + .isReg() || + PseudoVarargThunkInstr + .getOperand(PseudoVarargThunkInstr.getNumOperands() - 1) + .getReg() == X86::EFLAGS) && "Expected last argument to be EFLAGS"); + + // TODO: add support for YMM and ZMM here. unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + // In the XMM save block, save all the XMM argument registers. 
- for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { - int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; - MachineMemOperand *MMO = F->getMachineMemOperand( - MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), - MachineMemOperand::MOStore, - /*Size=*/16, /*Align=*/16); - BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) - .addFrameIndex(RegSaveFrameIndex) + for (unsigned OpndIdx = 3, RegIdx = 0; + OpndIdx + 1 < PseudoVarargThunkInstr.getNumOperands(); + OpndIdx++, RegIdx++) { + int64_t Offset = RegIdx * 16 + VarArgsRegsOffset; + + MachineMemOperand *MMO = + Func->getMachineMemOperand(MachinePointerInfo::getFixedStack( + *Func, ThunkRegSaveFrameIndex, Offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + BuildMI(GuardedRegsBlk, DL, TII->get(MOVOpc)) + .addFrameIndex(ThunkRegSaveFrameIndex) .addImm(/*Scale=*/1) .addReg(/*IndexReg=*/0) .addImm(/*Disp=*/Offset) .addReg(/*Segment=*/0) - .addReg(MI.getOperand(i).getReg()) + .addReg(PseudoVarargThunkInstr.getOperand(OpndIdx).getReg()) .addMemOperand(MMO); + assert(Register::isPhysicalRegister( + PseudoVarargThunkInstr.getOperand(OpndIdx).getReg())); + + if (NeedToAddLiveInsIntoGuardedRegsBlk) + GuardedRegsBlk->addLiveIn( + PseudoVarargThunkInstr.getOperand(OpndIdx).getReg()); } - MI.eraseFromParent(); // The pseudo instruction is gone now. + PseudoVarargThunkInstr + .eraseFromParent(); // The pseudo instruction is gone now. - return EndMBB; + return TailBlk; } // The EFLAGS operand of SelectItr might be missing a kill marker @@ -31320,6 +31500,9 @@ case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); + case X86::VARARG_THUNK_SAVE_XMM_REGS: + return EmitVarargThunkSaveXMMRegsWithCustomInserter(MI, BB); + case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); Index: llvm/lib/Target/X86/X86InstrCompiler.td =================================================================== --- llvm/lib/Target/X86/X86InstrCompiler.td +++ llvm/lib/Target/X86/X86InstrCompiler.td @@ -81,6 +81,19 @@ imm:$offset), (implicit EFLAGS)]>; +// x86-64 %al guarded thunk arguments lowering magic. +def VARARG_THUNK_SAVE_XMM_REGS : I<0, Pseudo, + (outs), + (ins GR8:$al, + i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#VARARG_THUNK_SAVE_XMM_REGS $al, $regsavefi, $offset", + [(X86vararg_thunk_save_xmm_regs GR8:$al, + imm:$regsavefi, + imm:$offset), + (implicit EFLAGS)]>; + + // The VAARG_64 pseudo-instruction takes the address of the va_list, // and places the address of the next argument into a register. 
let Defs = [EFLAGS] in Index: llvm/lib/Target/X86/X86InstrInfo.td =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.td +++ llvm/lib/Target/X86/X86InstrInfo.td @@ -99,6 +99,11 @@ SDTCisVT<1, iPTR>, SDTCisVT<2, iPTR>]>; +def SDT_X86VARARG_THUNK_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + + def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, @@ -190,6 +195,12 @@ SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", SDT_X86VASTART_SAVE_XMM_REGS, [SDNPHasChain, SDNPVariadic]>; + +def X86vararg_thunk_save_xmm_regs : + SDNode<"X86ISD::VARARG_THUNK_SAVE_XMM_REGS", + SDT_X86VARARG_THUNK_SAVE_XMM_REGS, + [SDNPHasChain, SDNPVariadic]>; + def X86vaarg64 : SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, Index: llvm/lib/Target/X86/X86MachineFunctionInfo.h =================================================================== --- llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -73,6 +73,9 @@ int VarArgsFrameIndex = 0; /// RegSaveFrameIndex - X86-64 vararg func register save area. int RegSaveFrameIndex = 0; + /// thunkRegSaveFrameIndex - X86-64 vararg func register save area for thunk + /// functions. + int thunkRegSaveFrameIndex = 0; /// VarArgsGPOffset - X86-64 vararg func int reg offset. unsigned VarArgsGPOffset = 0; /// VarArgsFPOffset - X86-64 vararg func fp reg offset. @@ -155,6 +158,9 @@ int getRegSaveFrameIndex() const { return RegSaveFrameIndex; } void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; } + int getThunkRegSaveFrameIndex() const { return thunkRegSaveFrameIndex; } + void setThunkRegSaveFrameIndex(int Idx) { thunkRegSaveFrameIndex = Idx; } + unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; } void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; } Index: llvm/test/CodeGen/X86/musttail-varargs.ll =================================================================== --- llvm/test/CodeGen/X86/musttail-varargs.ll +++ llvm/test/CodeGen/X86/musttail-varargs.ll @@ -1,9 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32 +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE-OPT0 ; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE +; RUN: llc -verify-machineinstrs -O0 < %s -enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | 
FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE-OPT0 ; Test that we actually spill and reload all arguments in the variadic argument ; pack. Doing a normal call will clobber all argument registers, and we will @@ -29,8 +34,8 @@ ; LINUX-NEXT: .cfi_def_cfa_offset 48 ; LINUX-NEXT: pushq %rbx ; LINUX-NEXT: .cfi_def_cfa_offset 56 -; LINUX-NEXT: subq $360, %rsp # imm = 0x168 -; LINUX-NEXT: .cfi_def_cfa_offset 416 +; LINUX-NEXT: subq $232, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 288 ; LINUX-NEXT: .cfi_offset %rbx, -56 ; LINUX-NEXT: .cfi_offset %r12, -48 ; LINUX-NEXT: .cfi_offset %r13, -40 @@ -43,6 +48,11 @@ ; LINUX-NEXT: movq %rdx, %rbp ; LINUX-NEXT: movq %rsi, %rbx ; LINUX-NEXT: movq %rdi, %r14 +; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r9, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; LINUX-NEXT: testb %al, %al ; LINUX-NEXT: je .LBB0_2 @@ -56,11 +66,6 @@ ; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; LINUX-NEXT: .LBB0_2: -; LINUX-NEXT: movq %rbx, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %rbp, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r13, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r12, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r15, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax @@ -68,14 +73,6 @@ ; LINUX-NEXT: movabsq $206158430216, %rax # imm = 0x3000000008 ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %r14, %rdi -; LINUX-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: callq get_f ; LINUX-NEXT: movq %rax, %r11 ; LINUX-NEXT: movq %r14, %rdi @@ -84,16 +81,36 @@ ; LINUX-NEXT: movq %r13, %rcx ; LINUX-NEXT: movq %r12, %r8 ; LINUX-NEXT: movq %r15, %r9 -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; LINUX-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload -; LINUX-NEXT: addq $360, %rsp # imm = 0x168 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB0_4 +; LINUX-NEXT: # %bb.3: +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; 
LINUX-NEXT: addq $232, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 56 +; LINUX-NEXT: popq %rbx +; LINUX-NEXT: .cfi_def_cfa_offset 48 +; LINUX-NEXT: popq %r12 +; LINUX-NEXT: .cfi_def_cfa_offset 40 +; LINUX-NEXT: popq %r13 +; LINUX-NEXT: .cfi_def_cfa_offset 32 +; LINUX-NEXT: popq %r14 +; LINUX-NEXT: .cfi_def_cfa_offset 24 +; LINUX-NEXT: popq %r15 +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: popq %rbp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB0_4: +; LINUX-NEXT: .cfi_def_cfa_offset 288 +; LINUX-NEXT: addq $232, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 56 ; LINUX-NEXT: popq %rbx ; LINUX-NEXT: .cfi_def_cfa_offset 48 @@ -109,6 +126,85 @@ ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-OPT0-LABEL: f_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 336 +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: je .LBB0_2 +; LINUX-OPT0-NEXT: # %bb.1: +; LINUX-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB0_2: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdi, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r8b # 1-byte Reload +; LINUX-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; LINUX-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; LINUX-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movl $48, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movl $8, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, %rdi +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: callq get_f +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-OPT0-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r10b # 1-byte Reload +; LINUX-OPT0-NEXT: movq %rax, (%rsp) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, %al +; LINUX-OPT0-NEXT: movq (%rsp), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB0_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB0_4: +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 336 +; LINUX-OPT0-NEXT: addq $328, %rsp # imm = 0x148 +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: f_thunk: ; LINUX-X32: # %bb.0: ; LINUX-X32-NEXT: pushq %rbp @@ -123,8 +219,8 @@ ; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 ; LINUX-X32-NEXT: pushq %rbx ; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 -; LINUX-X32-NEXT: subl $344, %esp # imm = 0x158 -; LINUX-X32-NEXT: .cfi_def_cfa_offset 400 +; LINUX-X32-NEXT: subl $216, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 272 ; LINUX-X32-NEXT: .cfi_offset %rbx, -56 ; LINUX-X32-NEXT: .cfi_offset %r12, -48 ; LINUX-X32-NEXT: .cfi_offset %r13, -40 @@ -137,6 +233,11 @@ ; LINUX-X32-NEXT: movq %rdx, %rbp ; LINUX-X32-NEXT: movq %rsi, %rbx ; LINUX-X32-NEXT: movl %edi, %r14d +; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rcx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r8, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r9, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; LINUX-X32-NEXT: testb %al, %al ; LINUX-X32-NEXT: je .LBB0_2 @@ -150,11 +251,6 @@ ; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: .LBB0_2: -; LINUX-X32-NEXT: movq %rbx, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %rbp, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r13, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r12, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r15, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax ; LINUX-X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax @@ -162,14 +258,6 @@ ; LINUX-X32-NEXT: movabsq $206158430216, %rax # imm = 0x3000000008 ; LINUX-X32-NEXT: movq %rax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movl %r14d, %edi -; LINUX-X32-NEXT: movaps %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: callq get_f ; LINUX-X32-NEXT: movl %eax, %r11d ; LINUX-X32-NEXT: movl %r14d, %edi @@ -178,16 +266,36 @@ ; LINUX-X32-NEXT: movq %r13, %rcx ; LINUX-X32-NEXT: movq %r12, %r8 ; LINUX-X32-NEXT: movq %r15, %r9 -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload ; LINUX-X32-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload -; LINUX-X32-NEXT: addl $344, %esp # imm = 0x158 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB0_4 +; LINUX-X32-NEXT: # %bb.3: +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-NEXT: addl $216, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 +; LINUX-X32-NEXT: popq %rbx +; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 +; LINUX-X32-NEXT: popq %r12 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 40 +; LINUX-X32-NEXT: popq %r13 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 32 +; LINUX-X32-NEXT: popq %r14 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 24 +; LINUX-X32-NEXT: popq %r15 +; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: popq %rbp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB0_4: +; LINUX-X32-NEXT: .cfi_def_cfa_offset 272 +; LINUX-X32-NEXT: addl $216, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 56 ; LINUX-X32-NEXT: popq %rbx ; LINUX-X32-NEXT: .cfi_def_cfa_offset 48 @@ -203,6 +311,87 @@ ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-X32-OPT0-LABEL: f_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 320 +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB0_2 +; LINUX-X32-OPT0-NEXT: # %bb.1: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; 
LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB0_2: +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rax # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rax, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdi, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %r8b # 1-byte Reload +; LINUX-X32-OPT0-NEXT: leal {{[0-9]+}}(%rsp), %r9d +; LINUX-X32-OPT0-NEXT: movl %r9d, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: leal {{[0-9]+}}(%rsp), %r9d +; LINUX-X32-OPT0-NEXT: movl %r9d, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl $48, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl $8, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %r9d # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movl %r9d, %edi +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r8b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: callq get_f +; LINUX-X32-OPT0-NEXT: movl %eax, %eax +; LINUX-X32-OPT0-NEXT: movl %eax, %ecx +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r10 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rcx, (%esp) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r10, %rcx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq (%esp), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB0_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB0_4: +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 320 +; LINUX-X32-OPT0-NEXT: addl $312, %esp # imm = 0x138 +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; WINDOWS-LABEL: f_thunk: ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: pushq %r14 @@ -246,6 +435,36 @@ ; 
WINDOWS-NEXT: .text ; WINDOWS-NEXT: .seh_endproc ; +; WINDOWS-OPT0-LABEL: f_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq $120, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 120 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: leaq {{[0-9]+}}(%rsp), %r10 +; WINDOWS-OPT0-NEXT: movq %r10, {{[0-9]+}}(%rsp) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; WINDOWS-OPT0-NEXT: callq get_f +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Reload +; WINDOWS-OPT0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movb %r11b, %al +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $120, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; ; X86-NOSSE-LABEL: f_thunk: ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp @@ -264,6 +483,25 @@ ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: jmpl *%eax # TAILCALL ; +; X86-NOSSE-OPT0-LABEL: f_thunk: +; X86-NOSSE-OPT0: # %bb.0: +; X86-NOSSE-OPT0-NEXT: pushl %ebp +; X86-NOSSE-OPT0-NEXT: movl %esp, %ebp +; X86-NOSSE-OPT0-NEXT: andl $-16, %esp +; X86-NOSSE-OPT0-NEXT: subl $48, %esp +; X86-NOSSE-OPT0-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-OPT0-NEXT: leal 12(%ebp), %ecx +; X86-NOSSE-OPT0-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: movl %esp, %ecx +; X86-NOSSE-OPT0-NEXT: movl %eax, (%ecx) +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOSSE-OPT0-NEXT: calll _get_f +; X86-NOSSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl %ecx, 8(%ebp) +; X86-NOSSE-OPT0-NEXT: movl %ebp, %esp +; X86-NOSSE-OPT0-NEXT: popl %ebp +; X86-NOSSE-OPT0-NEXT: jmpl *%eax # TAILCALL +; ; X86-SSE-LABEL: f_thunk: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %ebp @@ -287,6 +525,31 @@ ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: jmpl *%eax # TAILCALL +; +; X86-SSE-OPT0-LABEL: f_thunk: +; X86-SSE-OPT0: # %bb.0: +; X86-SSE-OPT0-NEXT: pushl %ebp +; X86-SSE-OPT0-NEXT: movl %esp, %ebp +; X86-SSE-OPT0-NEXT: andl $-16, %esp +; X86-SSE-OPT0-NEXT: subl $112, %esp +; X86-SSE-OPT0-NEXT: movl 8(%ebp), %eax +; X86-SSE-OPT0-NEXT: leal 12(%ebp), %ecx +; X86-SSE-OPT0-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE-OPT0-NEXT: movl %esp, %ecx +; X86-SSE-OPT0-NEXT: movl %eax, (%ecx) +; X86-SSE-OPT0-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE-OPT0-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: calll _get_f +; X86-SSE-OPT0-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE-OPT0-NEXT: movl %ecx, 8(%ebp) +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movl %ebp, %esp +; X86-SSE-OPT0-NEXT: popl %ebp +; X86-SSE-OPT0-NEXT: jmpl *%eax # TAILCALL %ap = alloca [4 x i8*], align 16 %ap_i8 = bitcast [4 x i8*]* %ap to i8* call void @llvm.va_start(i8* %ap_i8) @@ -300,27 +563,192 @@ ; No regparms on normal x86 conventions. -; This thunk shouldn't require any spills and reloads, assuming the register -; allocator knows what it's doing. +; This thunk stores xmms on entry and restores them before jumping. +; Storing and restoring xmms could be optimized out for this concrete case. define void @g_thunk(i8* %fptr_i8, ...) { ; LINUX-LABEL: g_thunk: ; LINUX: # %bb.0: -; LINUX-NEXT: pushq %rax -; LINUX-NEXT: .cfi_def_cfa_offset 16 -; LINUX-NEXT: popq %r11 +; LINUX-NEXT: subq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB1_2 +; LINUX-NEXT: # %bb.1: +; LINUX-NEXT: movaps %xmm0, (%rsp) +; LINUX-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-NEXT: .LBB1_2: +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB1_4 +; LINUX-NEXT: # %bb.3: +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps (%rsp), %xmm0 +; LINUX-NEXT: addq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%rdi # TAILCALL +; LINUX-NEXT: .LBB1_4: +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: addq $136, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%rdi # TAILCALL ; +; LINUX-OPT0-LABEL: g_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $200, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-OPT0-NEXT: movb %al, %r10b +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: je .LBB1_2 +; LINUX-OPT0-NEXT: # %bb.1: +; LINUX-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB1_2: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload 
+; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB1_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $200, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB1_4: +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-OPT0-NEXT: addq $200, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: g_thunk: ; LINUX-X32: # %bb.0: -; LINUX-X32-NEXT: pushq %rax -; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: subl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB1_2 +; LINUX-X32-NEXT: # %bb.1: +; LINUX-X32-NEXT: movaps %xmm0, (%esp) +; LINUX-X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: .LBB1_2: ; LINUX-X32-NEXT: movl %edi, %r11d -; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB1_4 +; LINUX-X32-NEXT: # %bb.3: +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps (%esp), %xmm0 +; LINUX-X32-NEXT: addl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB1_4: +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: addl $136, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-X32-OPT0-LABEL: g_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $200, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-X32-OPT0-NEXT: movb %al, %r10b +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 
8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB1_2 +; LINUX-X32-OPT0-NEXT: # %bb.1: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB1_2: +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl %eax, %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, (%esp) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq (%esp), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB1_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $200, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB1_4: +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 208 +; LINUX-X32-OPT0-NEXT: addl $200, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; WINDOWS-LABEL: g_thunk: ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: subq $40, %rsp @@ -332,6 +760,19 @@ ; WINDOWS-NEXT: .text ; WINDOWS-NEXT: .seh_endproc ; +; WINDOWS-OPT0-LABEL: g_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq $40, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 40 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $40, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; ; X86-LABEL: g_thunk: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -353,41 +794,321 @@ define void @h_thunk(%struct.Foo* %this, ...) 
{ ; LINUX-LABEL: h_thunk: ; LINUX: # %bb.0: -; LINUX-NEXT: pushq %rax -; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: subq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_2 +; LINUX-NEXT: # %bb.1: +; LINUX-NEXT: movaps %xmm0, (%rsp) +; LINUX-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-NEXT: .LBB2_2: ; LINUX-NEXT: cmpb $1, (%rdi) -; LINUX-NEXT: jne .LBB2_2 -; LINUX-NEXT: # %bb.1: # %then +; LINUX-NEXT: jne .LBB2_4 +; LINUX-NEXT: # %bb.3: # %then ; LINUX-NEXT: movq 8(%rdi), %r11 -; LINUX-NEXT: addq $8, %rsp +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_6 +; LINUX-NEXT: # %bb.5: # %then +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps (%rsp), %xmm0 +; LINUX-NEXT: addq $136, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL -; LINUX-NEXT: .LBB2_2: # %else -; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: .LBB2_4: # %else +; LINUX-NEXT: .cfi_def_cfa_offset 144 ; LINUX-NEXT: movq 16(%rdi), %r11 ; LINUX-NEXT: movl $42, {{.*}}(%rip) -; LINUX-NEXT: addq $8, %rsp +; LINUX-NEXT: testb %al, %al +; LINUX-NEXT: je .LBB2_8 +; LINUX-NEXT: # %bb.7: # %else +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-NEXT: movaps (%rsp), %xmm0 +; LINUX-NEXT: addq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB2_6: # %then +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: addq $136, %rsp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: jmpq *%r11 # TAILCALL +; LINUX-NEXT: .LBB2_8: # %else +; LINUX-NEXT: .cfi_def_cfa_offset 144 +; LINUX-NEXT: addq $136, %rsp ; LINUX-NEXT: .cfi_def_cfa_offset 8 ; LINUX-NEXT: jmpq *%r11 # TAILCALL ; +; LINUX-OPT0-LABEL: h_thunk: +; LINUX-OPT0: # %bb.0: +; LINUX-OPT0-NEXT: subq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: movb %al, %r10b +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; LINUX-OPT0-NEXT: je .LBB2_4 +; LINUX-OPT0-NEXT: # %bb.3: +; LINUX-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm4, 
{{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; LINUX-OPT0-NEXT: .LBB2_4: +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: testb $1, (%rax) +; LINUX-OPT0-NEXT: jne .LBB2_1 +; LINUX-OPT0-NEXT: jmp .LBB2_2 +; LINUX-OPT0-NEXT: .LBB2_1: # %then +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq 8(%rax), %rcx +; LINUX-OPT0-NEXT: movq %rax, %rdi +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, %rcx +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB2_6 +; LINUX-OPT0-NEXT: # %bb.5: # %then +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_6: # %then +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_2: # %else +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; LINUX-OPT0-NEXT: movq 16(%rax), %rcx +; LINUX-OPT0-NEXT: movl $42, {{.*}}(%rip) +; LINUX-OPT0-NEXT: movq %rax, %rdi +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-OPT0-NEXT: movq %r8, %rcx +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; LINUX-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-OPT0-NEXT: testb %al, %al +; LINUX-OPT0-NEXT: je .LBB2_8 +; LINUX-OPT0-NEXT: # %bb.7: # %else +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; LINUX-OPT0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-OPT0-NEXT: .LBB2_8: # %else +; 
LINUX-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-OPT0-NEXT: addq $216, %rsp +; LINUX-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-OPT0-NEXT: jmpq *%r11 # TAILCALL +; ; LINUX-X32-LABEL: h_thunk: ; LINUX-X32: # %bb.0: -; LINUX-X32-NEXT: pushq %rax -; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: subl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_2 +; LINUX-X32-NEXT: # %bb.1: +; LINUX-X32-NEXT: movaps %xmm0, (%esp) +; LINUX-X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: .LBB2_2: ; LINUX-X32-NEXT: cmpb $1, (%edi) -; LINUX-X32-NEXT: jne .LBB2_2 -; LINUX-X32-NEXT: # %bb.1: # %then +; LINUX-X32-NEXT: jne .LBB2_4 +; LINUX-X32-NEXT: # %bb.3: # %then ; LINUX-X32-NEXT: movl 4(%edi), %r11d -; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_6 +; LINUX-X32-NEXT: # %bb.5: # %then +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps (%esp), %xmm0 +; LINUX-X32-NEXT: addl $136, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL -; LINUX-X32-NEXT: .LBB2_2: # %else -; LINUX-X32-NEXT: .cfi_def_cfa_offset 16 +; LINUX-X32-NEXT: .LBB2_4: # %else +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 ; LINUX-X32-NEXT: movl 8(%edi), %r11d ; LINUX-X32-NEXT: movl $42, {{.*}}(%rip) -; LINUX-X32-NEXT: addl $8, %esp +; LINUX-X32-NEXT: testb %al, %al +; LINUX-X32-NEXT: je .LBB2_8 +; LINUX-X32-NEXT: # %bb.7: # %else +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-NEXT: movaps (%esp), %xmm0 +; LINUX-X32-NEXT: addl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB2_6: # %then +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: addl $136, %esp ; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 ; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-NEXT: .LBB2_8: # %else +; LINUX-X32-NEXT: .cfi_def_cfa_offset 144 +; LINUX-X32-NEXT: addl $136, %esp +; LINUX-X32-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-NEXT: jmpq *%r11 # TAILCALL +; +; LINUX-X32-OPT0-LABEL: h_thunk: +; LINUX-X32-OPT0: # %bb.0: +; LINUX-X32-OPT0-NEXT: subl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: movb %al, %r10b +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rsi, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, 
{{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movb %r10b, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; LINUX-X32-OPT0-NEXT: je .LBB2_4 +; LINUX-X32-OPT0-NEXT: # %bb.3: +; LINUX-X32-OPT0-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm5, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) +; LINUX-X32-OPT0-NEXT: .LBB2_4: +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: testb $1, (%eax) +; LINUX-X32-OPT0-NEXT: jne .LBB2_1 +; LINUX-X32-OPT0-NEXT: jmp .LBB2_2 +; LINUX-X32-OPT0-NEXT: .LBB2_1: # %then +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl 4(%eax), %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB2_6 +; LINUX-X32-OPT0-NEXT: # %bb.5: # %then +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_6: # %then +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_2: # %else +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; LINUX-X32-OPT0-NEXT: movl 8(%eax), %ecx +; LINUX-X32-OPT0-NEXT: movl %ecx, %edx +; LINUX-X32-OPT0-NEXT: movl $42, {{.*}}(%rip) +; LINUX-X32-OPT0-NEXT: movl %eax, %edi +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rsi # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-OPT0-NEXT: movq %r8, %rdx +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %rcx # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r8 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload +; 
LINUX-X32-OPT0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; LINUX-X32-OPT0-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r11 # 8-byte Reload +; LINUX-X32-OPT0-NEXT: testb %al, %al +; LINUX-X32-OPT0-NEXT: je .LBB2_8 +; LINUX-X32-OPT0-NEXT: # %bb.7: # %else +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm7 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm6 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm5 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm4 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm2 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; LINUX-X32-OPT0-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL +; LINUX-X32-OPT0-NEXT: .LBB2_8: # %else +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 224 +; LINUX-X32-OPT0-NEXT: addl $216, %esp +; LINUX-X32-OPT0-NEXT: .cfi_def_cfa_offset 8 +; LINUX-X32-OPT0-NEXT: jmpq *%r11 # TAILCALL ; ; WINDOWS-LABEL: h_thunk: ; WINDOWS: # %bb.0: @@ -409,23 +1130,136 @@ ; WINDOWS-NEXT: .text ; WINDOWS-NEXT: .seh_endproc ; -; X86-LABEL: h_thunk: -; X86: # %bb.0: -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb $1, (%eax) -; X86-NEXT: jne LBB2_2 -; X86-NEXT: # %bb.1: # %then -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: popl %eax -; X86-NEXT: jmpl *%ecx # TAILCALL -; X86-NEXT: LBB2_2: # %else -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl $42, _g -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: popl %eax -; X86-NEXT: jmpl *%ecx # TAILCALL +; WINDOWS-OPT0-LABEL: h_thunk: +; WINDOWS-OPT0: # %bb.0: +; WINDOWS-OPT0-NEXT: subq $88, %rsp +; WINDOWS-OPT0-NEXT: .seh_stackalloc 88 +; WINDOWS-OPT0-NEXT: .seh_endprologue +; WINDOWS-OPT0-NEXT: testb $1, (%rcx) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; WINDOWS-OPT0-NEXT: jne .LBB2_1 +; WINDOWS-OPT0-NEXT: jmp .LBB2_2 +; WINDOWS-OPT0-NEXT: .LBB2_1: # %then +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq 8(%rax), %rcx +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rax, %rcx +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $88, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .LBB2_2: # %else +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq 16(%rax), %rcx +; WINDOWS-OPT0-NEXT: movl $42, {{.*}}(%rip) +; WINDOWS-OPT0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WINDOWS-OPT0-NEXT: movq %rax, %rcx +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; WINDOWS-OPT0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; WINDOWS-OPT0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; WINDOWS-OPT0-NEXT: addq $88, %rsp +; WINDOWS-OPT0-NEXT: rex64 jmpq *%r10 # TAILCALL +; WINDOWS-OPT0-NEXT: .seh_handlerdata +; WINDOWS-OPT0-NEXT: .text +; WINDOWS-OPT0-NEXT: .seh_endproc +; +; X86-NOSSE-LABEL: h_thunk: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: cmpb $1, (%eax) +; X86-NOSSE-NEXT: jne LBB2_2 +; X86-NOSSE-NEXT: # %bb.1: # %then +; X86-NOSSE-NEXT: movl 4(%eax), %ecx +; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: jmpl *%ecx # TAILCALL +; X86-NOSSE-NEXT: LBB2_2: # %else +; X86-NOSSE-NEXT: movl 8(%eax), %ecx +; X86-NOSSE-NEXT: movl $42, _g +; X86-NOSSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: jmpl *%ecx # TAILCALL +; +; X86-NOSSE-OPT0-LABEL: h_thunk: +; X86-NOSSE-OPT0: # %bb.0: +; X86-NOSSE-OPT0-NEXT: subl $8, %esp +; X86-NOSSE-OPT0-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-OPT0-NEXT: testb $1, (%eax) +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOSSE-OPT0-NEXT: jne LBB2_1 +; X86-NOSSE-OPT0-NEXT: jmp LBB2_2 +; X86-NOSSE-OPT0-NEXT: LBB2_1: # %then +; X86-NOSSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl 4(%eax), %ecx +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: addl $8, %esp +; X86-NOSSE-OPT0-NEXT: jmpl *%ecx # TAILCALL +; X86-NOSSE-OPT0-NEXT: LBB2_2: # %else +; X86-NOSSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NOSSE-OPT0-NEXT: movl 8(%eax), %ecx +; X86-NOSSE-OPT0-NEXT: movl $42, _g +; X86-NOSSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NOSSE-OPT0-NEXT: addl $8, %esp +; X86-NOSSE-OPT0-NEXT: jmpl *%ecx # TAILCALL +; +; X86-SSE-LABEL: h_thunk: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cmpb $1, (%eax) +; X86-SSE-NEXT: jne LBB2_2 +; X86-SSE-NEXT: # %bb.1: # %then +; X86-SSE-NEXT: movl 4(%eax), %ecx +; X86-SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: jmpl *%ecx # TAILCALL +; X86-SSE-NEXT: LBB2_2: # %else +; X86-SSE-NEXT: movl 8(%eax), %ecx +; X86-SSE-NEXT: movl $42, _g +; X86-SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: jmpl *%ecx # TAILCALL +; +; X86-SSE-OPT0-LABEL: h_thunk: +; X86-SSE-OPT0: # %bb.0: +; X86-SSE-OPT0-NEXT: subl $92, %esp +; X86-SSE-OPT0-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-OPT0-NEXT: testb $1, (%eax) +; X86-SSE-OPT0-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE-OPT0-NEXT: movups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-OPT0-NEXT: jne LBB2_1 +; X86-SSE-OPT0-NEXT: jmp LBB2_2 +; X86-SSE-OPT0-NEXT: LBB2_1: # %then +; X86-SSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE-OPT0-NEXT: movl 4(%eax), %ecx +; X86-SSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-OPT0-NEXT: 
addl $92, %esp
+; X86-SSE-OPT0-NEXT: jmpl *%ecx # TAILCALL
+; X86-SSE-OPT0-NEXT: LBB2_2: # %else
+; X86-SSE-OPT0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE-OPT0-NEXT: movl 8(%eax), %ecx
+; X86-SSE-OPT0-NEXT: movl $42, _g
+; X86-SSE-OPT0-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-SSE-OPT0-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
+; X86-SSE-OPT0-NEXT: addl $92, %esp
+; X86-SSE-OPT0-NEXT: jmpl *%ecx # TAILCALL
 %cond_p = getelementptr %struct.Foo, %struct.Foo* %this, i32 0, i32 0
 %cond = load i1, i1* %cond_p
 br i1 %cond, label %then, label %else
Index: llvm/test/CodeGen/X86/vastart-defs-eflags.ll
===================================================================
--- llvm/test/CodeGen/X86/vastart-defs-eflags.ll
+++ llvm/test/CodeGen/X86/vastart-defs-eflags.ll
@@ -9,6 +9,11 @@
 ; CHECK-LABEL: check_flag:
 ; CHECK: ## %bb.0: ## %entry
 ; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT: testb %al, %al
 ; CHECK-NEXT: je LBB0_2
 ; CHECK-NEXT: ## %bb.1: ## %entry
@@ -21,11 +26,6 @@
 ; CHECK-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: LBB0_2: ## %entry
-; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: testl $512, %edi ## imm = 0x200
 ; CHECK-NEXT: je LBB0_4
Index: llvm/test/CodeGen/X86/x32-va_start.ll
===================================================================
--- llvm/test/CodeGen/X86/x32-va_start.ll
+++ llvm/test/CodeGen/X86/x32-va_start.ll
@@ -27,6 +27,11 @@
 call void @llvm.lifetime.start.p0i8(i64 16, i8* %0) #2
 call void @llvm.va_start(i8* %0)
 ; SSE: subl $72, %esp
+; CHECK-DAG: movq %r9
+; CHECK-DAG: movq %r8
+; CHECK-DAG: movq %rcx
+; CHECK-DAG: movq %rdx
+; CHECK-DAG: movq %rsi
 ; SSE: testb %al, %al
 ; SSE: je .[[NOFP:.*]]
 ; SSE-DAG: movaps %xmm1
@@ -38,11 +43,6 @@
 ; SSE-DAG: movaps %xmm7
 ; NOSSE-NOT: xmm
 ; SSE: .[[NOFP]]:
-; CHECK-DAG: movq %r9
-; CHECK-DAG: movq %r8
-; CHECK-DAG: movq %rcx
-; CHECK-DAG: movq %rdx
-; CHECK-DAG: movq %rsi
 %gp_offset_p = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0, i32 0
 %gp_offset = load i32, i32* %gp_offset_p, align 16
 %fits_in_gp = icmp ult i32 %gp_offset, 41
Index: llvm/test/CodeGen/X86/xmm-vararg-noopt.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/xmm-vararg-noopt.ll
@@ -0,0 +1,49 @@
+; RUN: llc -O0 -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+; CHECK-LABEL: testvarargs
+; Ensure that xmm registers are not used before testing %al
+; CHECK-NOT: xmm
+; CHECK: testb %al, %al
+; CHECK-NOT: xmm
+; CHECK: # %bb.1
+; CHECK-NEXT: %xmm0, {{.*}}%rsp
+; CHECK-NEXT: %xmm1, {{.*}}%rsp
+; CHECK-NEXT: %xmm2, {{.*}}%rsp
+; CHECK-NEXT: %xmm3, {{.*}}%rsp
+; CHECK-NEXT: %xmm4, {{.*}}%rsp
+; CHECK-NEXT: %xmm5, {{.*}}%rsp
+; CHECK-NEXT: %xmm6, {{.*}}%rsp
+; CHECK-NEXT: %xmm7, {{.*}}%rsp
+
+; ModuleID = 'variadic.c'
+source_filename = "variadic.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+@.str = private unnamed_addr constant [9 x i8] c"\0A hello \00", align 1
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @testvarargs(i8* %fmt, ...) {
+entry:
+  %fmt.addr = alloca i8*, align 8
+  %va = alloca [1 x %struct.__va_list_tag], align 16
+  store i8* %fmt, i8** %fmt.addr, align 8
+  %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %va, i64 0, i64 0
+  %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8*
+  call void @llvm.va_start(i8* %arraydecay1)
+  %arraydecay2 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %va, i64 0, i64 0
+  %arraydecay23 = bitcast %struct.__va_list_tag* %arraydecay2 to i8*
+  call void @llvm.va_end(i8* %arraydecay23)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i64 0, i64 0))
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.va_start(i8*)
+
+; Function Attrs: nounwind
+declare void @llvm.va_end(i8*)
+
+declare dso_local i32 @printf(i8*, ...)