diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -53,6 +53,7 @@ X86FloatingPoint.cpp X86FrameLowering.cpp X86InstructionSelector.cpp + X86ISelCallLowering.cpp X86ISelDAGToDAG.cpp X86ISelLowering.cpp X86IndirectBranchTracking.cpp diff --git a/llvm/lib/Target/X86/X86ISelCallLowering.cpp b/llvm/lib/Target/X86/X86ISelCallLowering.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/X86ISelCallLowering.cpp @@ -0,0 +1,2954 @@ +//===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file implements the lowering of LLVM calls to DAG nodes. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86CallingConv.h" +#include "X86FrameLowering.h" +#include "X86ISelLowering.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86TargetMachine.h" +#include "X86TargetObjectFile.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ObjCARCUtil.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IRBuilder.h" + +#define DEBUG_TYPE "x86-isel" + +using namespace llvm; + +STATISTIC(NumTailCalls, "Number of tail calls"); + +/// Call this when the user attempts to do something unsupported, like +/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike +/// report_fatal_error, so calling code should attempt to recover without +/// crashing. 
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, + const char *Msg) { + MachineFunction &MF = DAG.getMachineFunction(); + DAG.getContext()->diagnose( + DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc())); +} + +/// Returns true if a CC can dynamically exclude a register from the list of +/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on +/// the return registers. +static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) { + switch (CC) { + default: + return false; + case CallingConv::X86_RegCall: + case CallingConv::PreserveMost: + case CallingConv::PreserveAll: + return true; + } +} + +/// Returns true if a CC can dynamically exclude a register from the list of +/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on +/// the parameters. +static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) { + return CC == CallingConv::X86_RegCall; +} + +static std::pair +handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC, + const X86Subtarget &Subtarget) { + // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling + // convention is one that uses k registers. + if (NumElts == 2) + return {MVT::v2i64, 1}; + if (NumElts == 4) + return {MVT::v4i32, 1}; + if (NumElts == 8 && CC != CallingConv::X86_RegCall && + CC != CallingConv::Intel_OCL_BI) + return {MVT::v8i16, 1}; + if (NumElts == 16 && CC != CallingConv::X86_RegCall && + CC != CallingConv::Intel_OCL_BI) + return {MVT::v16i8, 1}; + // v32i1 passes in ymm unless we have BWI and the calling convention is + // regcall. + if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall)) + return {MVT::v32i8, 1}; + // Split v64i1 vectors if we don't have v64i8 available. 
+ if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) { + if (Subtarget.useAVX512Regs()) + return {MVT::v64i8, 1}; + return {MVT::v32i8, 2}; + } + + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) || + NumElts > 64) + return {MVT::i8, NumElts}; + + return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0}; +} + +MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, + CallingConv::ID CC, + EVT VT) const { + if (VT.isVector()) { + if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); + + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return RegisterVT; + } + + if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) + return MVT::v8f16; + } + + // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled. 
+ if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() && + !Subtarget.hasX87()) + return MVT::i32; + + if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) + return getRegisterTypeForCallingConv(Context, CC, + VT.changeVectorElementType(MVT::f16)); + + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); +} + +unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, + EVT VT) const { + if (VT.isVector()) { + if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); + + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return NumRegisters; + } + + if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) + return 1; + } + + // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if + // x87 is disabled. + if (!Subtarget.is64Bit() && !Subtarget.hasX87()) { + if (VT == MVT::f64) + return 2; + if (VT == MVT::f80) + return 3; + } + + if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) + return getNumRegistersForCallingConv(Context, CC, + VT.changeVectorElementType(MVT::f16)); + + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); +} + +unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const { + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. 
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) || + VT.getVectorNumElements() > 64)) { + RegisterVT = MVT::i8; + IntermediateVT = MVT::i1; + NumIntermediates = VT.getVectorNumElements(); + return NumIntermediates; + } + + // Split v64i1 vectors if we don't have v64i8 available. + if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && + CC != CallingConv::X86_RegCall) { + RegisterVT = MVT::v32i8; + IntermediateVT = MVT::v32i1; + NumIntermediates = 2; + return 2; + } + + // Split vNbf16 vectors according to vNf16. + if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) + VT = VT.changeVectorElementType(MVT::f16); + + return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, + NumIntermediates, RegisterVT); +} + +EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, + LLVMContext& Context, + EVT VT) const { + if (!VT.isVector()) + return MVT::i8; + + if (Subtarget.hasAVX512()) { + // Figure out what this type will be legalized to. + EVT LegalVT = VT; + while (getTypeAction(Context, LegalVT) != TypeLegal) + LegalVT = getTypeToTransformTo(Context, LegalVT); + + // If we got a 512-bit vector then we'll definitely have a vXi1 compare. + if (LegalVT.getSimpleVT().is512BitVector()) + return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount()); + + if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) { + // If we legalized to less than a 512-bit vector, then we will use a vXi1 + // compare for vXi32/vXi64 for sure. If we have BWI we will also support + // vXi16/vXi8. 
+ MVT EltVT = LegalVT.getSimpleVT().getVectorElementType(); + if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32) + return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount()); + } + } + + return VT.changeVectorElementTypeToInteger(); +} + +/// Helper for getByValTypeAlignment to determine +/// the desired ByVal argument alignment. +static void getMaxByValAlign(Type *Ty, Align &MaxAlign) { + if (MaxAlign == 16) + return; + if (VectorType *VTy = dyn_cast(Ty)) { + if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128) + MaxAlign = Align(16); + } else if (ArrayType *ATy = dyn_cast(Ty)) { + Align EltAlign; + getMaxByValAlign(ATy->getElementType(), EltAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + } else if (StructType *STy = dyn_cast(Ty)) { + for (auto *EltTy : STy->elements()) { + Align EltAlign; + getMaxByValAlign(EltTy, EltAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + if (MaxAlign == 16) + break; + } + } +} + +/// Return the desired alignment for ByVal aggregate +/// function arguments in the caller parameter area. For X86, aggregates +/// that contain SSE vectors are placed at 16-byte boundaries while the rest +/// are at 4-byte boundaries. +uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const { + if (Subtarget.is64Bit()) { + // Max of 8 and alignment of type. + Align TyAlign = DL.getABITypeAlign(Ty); + if (TyAlign > 8) + return TyAlign.value(); + return 8; + } + + Align Alignment(4); + if (Subtarget.hasSSE1()) + getMaxByValAlign(Ty, Alignment); + return Alignment.value(); +} + +/// It returns EVT::Other if the type should be determined using generic +/// target-independent logic. +/// For vector ops we check that the overall size isn't larger than our +/// preferred vector width. 
+EVT X86TargetLowering::getOptimalMemOpType( + const MemOp &Op, const AttributeList &FuncAttributes) const { + if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) { + if (Op.size() >= 16 && + (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) { + // FIXME: Check if unaligned 64-byte accesses are slow. + if (Op.size() >= 64 && Subtarget.hasAVX512() && + (Subtarget.getPreferVectorWidth() >= 512)) { + return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; + } + // FIXME: Check if unaligned 32-byte accesses are slow. + if (Op.size() >= 32 && Subtarget.hasAVX() && + Subtarget.useLight256BitInstructions()) { + // Although this isn't a well-supported type for AVX1, we'll let + // legalization and shuffle lowering produce the optimal codegen. If we + // choose an optimal type with a vector element larger than a byte, + // getMemsetStores() may create an intermediate splat (using an integer + // multiply) before we splat as a vector. + return MVT::v32i8; + } + if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128)) + return MVT::v16i8; + // TODO: Can SSE1 handle a byte vector? + // If we have SSE1 registers we should be able to use them. + if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) && + (Subtarget.getPreferVectorWidth() >= 128)) + return MVT::v4f32; + } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) && + Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { + // Do not use f64 to lower memcpy if source is string constant. It's + // better to use i32 to avoid the loads. + // Also, do not use f64 to lower memset unless this is a memset of zeros. + // The gymnastics of splatting a byte value into an XMM register and then + // only using 8-byte stores (because this is a CPU with slow unaligned + // 16-byte accesses) makes that a loser. + return MVT::f64; + } + } + // This is a compromise. If we reach here, unaligned accesses may be slow on + // this target. 
However, creating smaller, aligned accesses could be even + // slower and would certainly be a lot more code. + if (Subtarget.is64Bit() && Op.size() >= 8) + return MVT::i64; + return MVT::i32; +} + +bool X86TargetLowering::isSafeMemOpType(MVT VT) const { + if (VT == MVT::f32) + return Subtarget.hasSSE1(); + if (VT == MVT::f64) + return Subtarget.hasSSE2(); + return true; +} + +static bool isBitAligned(Align Alignment, uint64_t SizeInBits) { + return (8 * Alignment.value()) % SizeInBits == 0; +} + +bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const { + if (isBitAligned(Alignment, VT.getSizeInBits())) + return true; + switch (VT.getSizeInBits()) { + default: + // 8-byte and under are always assumed to be fast. + return true; + case 128: + return !Subtarget.isUnalignedMem16Slow(); + case 256: + return !Subtarget.isUnalignedMem32Slow(); + // TODO: What about AVX-512 (512-bit) accesses? + } +} + +bool X86TargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags, + unsigned *Fast) const { + if (Fast) + *Fast = isMemoryAccessFast(VT, Alignment); + // NonTemporal vector memory ops must be aligned. + if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { + // NT loads can only be vector aligned, so if its less aligned than the + // minimum vector size (which we can split the vector down to), we might as + // well use a regular unaligned vector load. + // We don't have any NT loads pre-SSE41. + if (!!(Flags & MachineMemOperand::MOLoad)) + return (Alignment < 16 || !Subtarget.hasSSE41()); + return false; + } + // Misaligned accesses of any size are always allowed. 
+ return true; +} + +bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context, + const DataLayout &DL, EVT VT, + unsigned AddrSpace, Align Alignment, + MachineMemOperand::Flags Flags, + unsigned *Fast) const { + if (Fast) + *Fast = isMemoryAccessFast(VT, Alignment); + if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { + if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, + /*Fast=*/nullptr)) + return true; + // NonTemporal vector memory ops are special, and must be aligned. + if (!isBitAligned(Alignment, VT.getSizeInBits())) + return false; + switch (VT.getSizeInBits()) { + case 128: + if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41()) + return true; + if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2()) + return true; + return false; + case 256: + if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2()) + return true; + if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX()) + return true; + return false; + case 512: + if (Subtarget.hasAVX512()) + return true; + return false; + default: + return false; // Don't have NonTemporal vector memory ops of this size. + } + } + return true; +} + +/// Return the entry encoding for a jump table in the +/// current function. The returned value is a member of the +/// MachineJumpTableInfo::JTEntryKind enum. +unsigned X86TargetLowering::getJumpTableEncoding() const { + // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF + // symbol. + if (isPositionIndependent() && Subtarget.isPICStyleGOT()) + return MachineJumpTableInfo::EK_Custom32; + + // Otherwise, use the normal jump table encoding heuristics. 
+ return TargetLowering::getJumpTableEncoding(); +} + +bool X86TargetLowering::splitValueIntoRegisterParts( + SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, + unsigned NumParts, MVT PartVT, std::optional CC) const { + bool IsABIRegCopy = CC.has_value(); + EVT ValueVT = Val.getValueType(); + if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { + unsigned ValueBits = ValueVT.getSizeInBits(); + unsigned PartBits = PartVT.getSizeInBits(); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); + Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); + Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); + Parts[0] = Val; + return true; + } + return false; +} + +SDValue X86TargetLowering::joinRegisterPartsIntoValue( + SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, + MVT PartVT, EVT ValueVT, std::optional CC) const { + bool IsABIRegCopy = CC.has_value(); + if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { + unsigned ValueBits = ValueVT.getSizeInBits(); + unsigned PartBits = PartVT.getSizeInBits(); + SDValue Val = Parts[0]; + + Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); + Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); + Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + return Val; + } + return SDValue(); +} + +bool X86TargetLowering::useSoftFloat() const { + return Subtarget.useSoftFloat(); +} + +void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, + ArgListTy &Args) const { + + // Only relabel X86-32 for C / Stdcall CCs. 
+ if (Subtarget.is64Bit()) + return; + if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) + return; + unsigned ParamRegs = 0; + if (auto *M = MF->getFunction().getParent()) + ParamRegs = M->getNumberRegisterParameters(); + + // Mark the first N int arguments as having reg + for (auto &Arg : Args) { + Type *T = Arg.Ty; + if (T->isIntOrPtrTy()) + if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { + unsigned numRegs = 1; + if (MF->getDataLayout().getTypeAllocSize(T) > 4) + numRegs = 2; + if (ParamRegs < numRegs) + return; + ParamRegs -= numRegs; + Arg.IsInReg = true; + } + } +} + +const MCExpr * +X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, + unsigned uid,MCContext &Ctx) const{ + assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); + // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF + // entries. + return MCSymbolRefExpr::create(MBB->getSymbol(), + MCSymbolRefExpr::VK_GOTOFF, Ctx); +} + +/// Returns relocation base for the given PIC jumptable. +SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, + SelectionDAG &DAG) const { + if (!Subtarget.is64Bit()) + // This doesn't have SDLoc associated with it, but is not really the + // same as a Register. + return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), + getPointerTy(DAG.getDataLayout())); + return Table; +} + +/// This returns the relocation base for the given PIC jumptable, +/// the same as getPICJumpTableRelocBase, but as an MCExpr. +const MCExpr *X86TargetLowering:: +getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, + MCContext &Ctx) const { + // X86-64 uses RIP relative addressing based on the jump table label. + if (Subtarget.isPICStyleRIPRel()) + return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); + + // Otherwise, the reference is relative to the PIC base. 
+ return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); +} + +std::pair +X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const { + const TargetRegisterClass *RRC = nullptr; + uint8_t Cost = 1; + switch (VT.SimpleTy) { + default: + return TargetLowering::findRepresentativeClass(TRI, VT); + case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: + RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; + break; + case MVT::x86mmx: + RRC = &X86::VR64RegClass; + break; + case MVT::f32: case MVT::f64: + case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v4f32: case MVT::v2f64: + case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: + case MVT::v8f32: case MVT::v4f64: + case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: + case MVT::v16f32: case MVT::v8f64: + RRC = &X86::VR128XRegClass; + break; + } + return std::make_pair(RRC, Cost); +} + +unsigned X86TargetLowering::getAddressSpace() const { + if (Subtarget.is64Bit()) + return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 
256 : 257; + return 256; +} + +static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { + return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || + (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); +} + +static Constant* SegmentOffset(IRBuilderBase &IRB, + int Offset, unsigned AddressSpace) { + return ConstantExpr::getIntToPtr( + ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); +} + +Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { + // glibc, bionic, and Fuchsia have a special slot for the stack guard in + // tcbhead_t; use it instead of the usual global variable (see + // sysdeps/{i386,x86_64}/nptl/tls.h) + if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { + if (Subtarget.isTargetFuchsia()) { + // defines ZX_TLS_STACK_GUARD_OFFSET with this value. + return SegmentOffset(IRB, 0x10, getAddressSpace()); + } else { + unsigned AddressSpace = getAddressSpace(); + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + // Specially, some users may customize the base reg and offset. + int Offset = M->getStackProtectorGuardOffset(); + // If we don't set -stack-protector-guard-offset value: + // %fs:0x28, unless we're using a Kernel code model, in which case + // it's %gs:0x28. gs:0x14 on i386. + if (Offset == INT_MAX) + Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; + + StringRef GuardReg = M->getStackProtectorGuardReg(); + if (GuardReg == "fs") + AddressSpace = X86AS::FS; + else if (GuardReg == "gs") + AddressSpace = X86AS::GS; + + // Use symbol guard if user specify. + StringRef GuardSymb = M->getStackProtectorGuardSymbol(); + if (!GuardSymb.empty()) { + GlobalVariable *GV = M->getGlobalVariable(GuardSymb); + if (!GV) { + Type *Ty = Subtarget.is64Bit() ? 
Type::getInt64Ty(M->getContext()) + : Type::getInt32Ty(M->getContext()); + GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, + nullptr, GuardSymb, nullptr, + GlobalValue::NotThreadLocal, AddressSpace); + GV->setDSOLocal(M->getDirectAccessExternalData()); + } + return GV; + } + + return SegmentOffset(IRB, Offset, AddressSpace); + } + } + return TargetLowering::getIRStackGuard(IRB); +} + +void X86TargetLowering::insertSSPDeclarations(Module &M) const { + // MSVC CRT provides functionalities for stack protection. + if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || + Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { + // MSVC CRT has a global variable holding security cookie. + M.getOrInsertGlobal("__security_cookie", + Type::getInt8PtrTy(M.getContext())); + + // MSVC CRT has a function to validate security cookie. + FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( + "__security_check_cookie", Type::getVoidTy(M.getContext()), + Type::getInt8PtrTy(M.getContext())); + if (Function *F = dyn_cast(SecurityCheckCookie.getCallee())) { + F->setCallingConv(CallingConv::X86_FastCall); + F->addParamAttr(0, Attribute::AttrKind::InReg); + } + return; + } + + StringRef GuardMode = M.getStackProtectorGuard(); + + // glibc, bionic, and Fuchsia have a special slot for the stack guard. + if ((GuardMode == "tls" || GuardMode.empty()) && + hasStackGuardSlotTLS(Subtarget.getTargetTriple())) + return; + TargetLowering::insertSSPDeclarations(M); +} + +Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { + // MSVC CRT has a global variable holding security cookie. 
+ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || + Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { + return M.getGlobalVariable("__security_cookie"); + } + return TargetLowering::getSDagStackGuard(M); +} + +Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { + // MSVC CRT has a function to validate security cookie. + if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || + Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { + return M.getFunction("__security_check_cookie"); + } + return TargetLowering::getSSPStackGuardCheck(M); +} + +Value * +X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const { + if (Subtarget.getTargetTriple().isOSContiki()) + return getDefaultSafeStackPointerLocation(IRB, false); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + if (Subtarget.isTargetAndroid()) { + // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: + // %gs:0x24 on i386 + int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; + return SegmentOffset(IRB, Offset, getAddressSpace()); + } + + // Fuchsia is similar. + if (Subtarget.isTargetFuchsia()) { + // defines ZX_TLS_UNSAFE_SP_OFFSET with this value. 
+ return SegmentOffset(IRB, 0x18, getAddressSpace()); + } + + return TargetLowering::getSafeStackPointerLocation(IRB); +} + +//===----------------------------------------------------------------------===// +// Return Value Calling Convention Implementation +//===----------------------------------------------------------------------===// + +bool X86TargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, + const SmallVectorImpl &Outs, LLVMContext &Context) const { + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); + return CCInfo.CheckReturn(Outs, RetCC_X86); +} + +const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { + static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; + return ScratchRegs; +} + +ArrayRef X86TargetLowering::getRoundingControlRegisters() const { + // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit + // tests at the moment, which is not what we expected. + static const MCPhysReg RCRegs[] = {X86::MXCSR}; + return RCRegs; +} + +/// Lowers masks values (v*i1) to the local register values +/// \returns DAG node after lowering to register type +static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, + const SDLoc &Dl, SelectionDAG &DAG) { + EVT ValVT = ValArg.getValueType(); + + if (ValVT == MVT::v1i1) + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg, + DAG.getIntPtrConstant(0, Dl)); + + if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || + (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { + // Two stage lowering might be required + // bitcast: v8i1 -> i8 / v16i1 -> i16 + // anyextend: i8 -> i32 / i16 -> i32 + EVT TempValLoc = ValVT == MVT::v8i1 ? 
MVT::i8 : MVT::i16; + SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg); + if (ValLoc == MVT::i32) + ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy); + return ValToCopy; + } + + if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) || + (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) { + // One stage lowering is required + // bitcast: v32i1 -> i32 / v64i1 -> i64 + return DAG.getBitcast(ValLoc, ValArg); + } + + return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg); +} + +/// Breaks v64i1 value into two registers and adds the new node to the DAG +static void Passv64i1ArgInRegs( + const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg, + SmallVectorImpl> &RegsToPass, CCValAssign &VA, + CCValAssign &NextVA, const X86Subtarget &Subtarget) { + assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); + assert(Subtarget.is32Bit() && "Expecting 32 bit target"); + assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); + assert(VA.isRegLoc() && NextVA.isRegLoc() && + "The value should reside in two registers"); + + // Before splitting the value we cast it to i64 + Arg = DAG.getBitcast(MVT::i64, Arg); + + // Splitting the value into two i32 types + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32); + + // Attach the two i32 types into corresponding registers + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); + RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); +} + +SDValue +X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SDLoc &dl, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo(); + + // In some cases we need to disable registers from the default CSR list. + // For example, when they are used as return registers (preserve_* and X86's + // regcall) or for argument passing (X86's regcall). 
+ bool ShouldDisableCalleeSavedRegister = + shouldDisableRetRegFromCSR(CallConv) || + MF.getFunction().hasFnAttribute("no_caller_saved_registers"); + + if (CallConv == CallingConv::X86_INTR && !Outs.empty()) + report_fatal_error("X86 interrupts may not return any value"); + + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC_X86); + + SmallVector, 4> RetVals; + for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; + ++I, ++OutsIndex) { + CCValAssign &VA = RVLocs[I]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + // Add the register to the CalleeSaveDisableRegs list. + if (ShouldDisableCalleeSavedRegister) + MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); + + SDValue ValToCopy = OutVals[OutsIndex]; + EVT ValVT = ValToCopy.getValueType(); + + // Promote values to the appropriate types. + if (VA.getLocInfo() == CCValAssign::SExt) + ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::AExt) { + if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) + ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); + else + ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); + } + else if (VA.getLocInfo() == CCValAssign::BCvt) + ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); + + assert(VA.getLocInfo() != CCValAssign::FPExt && + "Unexpected FP-extend for return value."); + + // Report an error if we have attempted to return a value via an XMM + // register and SSE was disabled. + if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. 
+ } else if (!Subtarget.hasSSE2() && + X86::FR64XRegClass.contains(VA.getLocReg()) && + ValVT == MVT::f64) { + // When returning a double via an XMM register, report an error if SSE2 is + // not enabled. + errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } + + // Returns in ST0/ST1 are handled specially: these are pushed as operands to + // the RET instruction and handled by the FP Stackifier. + if (VA.getLocReg() == X86::FP0 || + VA.getLocReg() == X86::FP1) { + // If this is a copy from an xmm register to ST(0), use an FPExtend to + // change the value to the FP stack register class. + if (isScalarFPTypeInSSEReg(VA.getValVT())) + ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); + RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); + // Don't emit a copytoreg. + continue; + } + + // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 + // which is returned in RAX / RDX. + if (Subtarget.is64Bit()) { + if (ValVT == MVT::x86mmx) { + if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { + ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); + ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + ValToCopy); + // If we don't have SSE2 available, convert to v4f32 so the generated + // register is legal. + if (!Subtarget.hasSSE2()) + ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); + } + } + } + + if (VA.needsCustom()) { + assert(VA.getValVT() == MVT::v64i1 && + "Currently the only custom case is when we split v64i1 to 2 regs"); + + Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I], + Subtarget); + + // Add the second register to the CalleeSaveDisableRegs list. 
+ if (ShouldDisableCalleeSavedRegister) + MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); + } else { + RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); + } + } + + SDValue Glue; + SmallVector RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + // Operand #1 = Bytes To Pop + RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, + MVT::i32)); + + // Copy the result values into the output registers. + for (auto &RetVal : RetVals) { + if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) { + RetOps.push_back(RetVal.second); + continue; // Don't emit a copytoreg. + } + + Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue); + Glue = Chain.getValue(1); + RetOps.push_back( + DAG.getRegister(RetVal.first, RetVal.second.getValueType())); + } + + // Swift calling convention does not require we copy the sret argument + // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. + + // All x86 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into %rax/%eax. + // + // Checking Function.hasStructRetAttr() here is insufficient because the IR + // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is + // false, then an sret argument may be implicitly inserted in the SelDAG. In + // either case FuncInfo->setSRetReturnReg() will have been called. + if (Register SRetReg = FuncInfo->getSRetReturnReg()) { + // When we have both sret and another return value, we should use the + // original Chain stored in RetOps[0], instead of the current Chain updated + // in the above loop. If we only have sret, RetOps[0] equals to Chain. 
+ + // For the case of sret and another return value, we have + // Chain_0 at the function entry + // Chain_1 = getCopyToReg(Chain_0) in the above loop + // If we use Chain_1 in getCopyFromReg, we will have + // Val = getCopyFromReg(Chain_1) + // Chain_2 = getCopyToReg(Chain_1, Val) from below + + // getCopyToReg(Chain_0) will be glued together with + // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be + // in Unit B, and we will have cyclic dependency between Unit A and Unit B: + // Data dependency from Unit B to Unit A due to usage of Val in + // getCopyToReg(Chain_1, Val) + // Chain dependency from Unit A to Unit B + + // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. + SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, + getPointerTy(MF.getDataLayout())); + + Register RetValReg + = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? + X86::RAX : X86::EAX; + Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue); + Glue = Chain.getValue(1); + + // RAX/EAX now acts like a return value. + RetOps.push_back( + DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); + + // Add the returned register to the CalleeSaveDisableRegs list. Don't do + // this however for preserve_most/preserve_all to minimize the number of + // callee-saved registers for these CCs. + if (ShouldDisableCalleeSavedRegister && + CallConv != CallingConv::PreserveAll && + CallConv != CallingConv::PreserveMost) + MF.getRegInfo().disableCalleeSavedRegister(RetValReg); + } + + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (X86::GR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } + + RetOps[0] = Chain; // Update chain. + + // Add the glue if we have it. 
+ if (Glue.getNode()) + RetOps.push_back(Glue); + + X86ISD::NodeType opcode = X86ISD::RET_GLUE; + if (CallConv == CallingConv::X86_INTR) + opcode = X86ISD::IRET; + return DAG.getNode(opcode, dl, MVT::Other, RetOps); +} + +bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { + if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) + return false; + + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. + if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else if (Copy->getOpcode() != ISD::FP_EXTEND) + return false; + + bool HasRet = false; + for (const SDNode *U : Copy->uses()) { + if (U->getOpcode() != X86ISD::RET_GLUE) + return false; + // If we are returning more than one value, we can definitely + // not make a tail call see PR19530 + if (U->getNumOperands() > 4) + return false; + if (U->getNumOperands() == 4 && + U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue) + return false; + HasRet = true; + } + + if (!HasRet) + return false; + + Chain = TCChain; + return true; +} + +EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const { + MVT ReturnMVT = MVT::i32; + + bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); + if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { + // The ABI does not require i1, i8 or i16 to be extended. + // + // On Darwin, there is code in the wild relying on Clang's old behaviour of + // always extending i8/i16 return values, so keep doing that for now. + // (PR26665). + ReturnMVT = MVT::i8; + } + + EVT MinVT = getRegisterType(Context, ReturnMVT); + return VT.bitsLT(MinVT) ? MinVT : VT; +} + +/// Reads two 32 bit registers and creates a 64 bit mask value. 
+/// \param VA The register location read as the first (low) 32 bit half.
+/// \param NextVA The register location read as the second (high) 32 bit half.
+/// \param Root The chain the register copies are attached to.
+/// \param [in,out] InGlue Optional glue. When non-null, both halves are read
+///                 directly from the physical registers, each copy is glued
+///                 to the previous one, and \p InGlue is updated to the glue
+///                 result of the last copy. When null, the registers are
+///                 added as function live-ins and read through virtual
+///                 registers instead.
+/// \return a v64i1 value built by concatenating the two v32i1 halves.
+static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
+                                SDValue &Root, SelectionDAG &DAG,
+                                const SDLoc &Dl, const X86Subtarget &Subtarget,
+                                SDValue *InGlue = nullptr) {
+  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
+  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
+  assert(VA.getValVT() == MVT::v64i1 &&
+         "Expecting first location of 64 bit width type");
+  assert(NextVA.getValVT() == VA.getValVT() &&
+         "The locations should have the same type");
+  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+         "The values should reside in two registers");
+
+  // Read one 32 bit value from each register.
+  SDValue LoReg, HiReg;
+  if (InGlue) {
+    // Physical registers are available: read them directly and glue the two
+    // reads together.
+    LoReg = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue);
+    *InGlue = LoReg.getValue(2);
+    HiReg =
+        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue);
+    *InGlue = HiReg.getValue(2);
+  } else {
+    // No physical register is present: go through intermediate virtual
+    // registers created as live-ins.
+    MachineFunction &MF = DAG.getMachineFunction();
+    const TargetRegisterClass *GR32 = &X86::GR32RegClass;
+    Register LoVReg = MF.addLiveIn(VA.getLocReg(), GR32);
+    LoReg = DAG.getCopyFromReg(Root, Dl, LoVReg, MVT::i32);
+    Register HiVReg = MF.addLiveIn(NextVA.getLocReg(), GR32);
+    HiReg = DAG.getCopyFromReg(Root, Dl, HiVReg, MVT::i32);
+  }
+
+  // Reinterpret each i32 as v32i1, then join the halves.
+  SDValue LoMask = DAG.getBitcast(MVT::v32i1, LoReg);
+  SDValue HiMask = DAG.getBitcast(MVT::v32i1, HiReg);
+  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, LoMask, HiMask);
+}
+
+/// Turn a value held in an integer register (8/16/32/64 bits) into the mask
+/// type \p ValVT (v1i1/v8i1/v16i1/v32i1/v64i1).
+///
+/// v1i1 is produced with SCALAR_TO_VECTOR. v64i1 is bitcast directly and
+/// expects an i64 location. The remaining mask types are truncated to the
+/// integer of matching width first and then bitcast.
+/// \returns the DAG node holding the operand in mask form.
+static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
+                               const EVT &ValLoc, const SDLoc &Dl,
+                               SelectionDAG &DAG) {
+  if (ValVT == MVT::v1i1)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValArg);
+
+  if (ValVT == MVT::v64i1) {
+    // In 32 bit machine, this case is handled by getv64i1Argument
+    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
+    // In 64 bit machine, There is no need to truncate the value only bitcast
+    return DAG.getBitcast(ValVT, ValArg);
+  }
+
+  // Select the integer type whose width equals the number of mask bits.
+  MVT TruncVT;
+  switch (ValVT.getSimpleVT().SimpleTy) {
+  case MVT::v8i1:
+    TruncVT = MVT::i8;
+    break;
+  case MVT::v16i1:
+    TruncVT = MVT::i16;
+    break;
+  case MVT::v32i1:
+    TruncVT = MVT::i32;
+    break;
+  default:
+    llvm_unreachable("Expecting a vector of i1 types");
+  }
+
+  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, Dl, TruncVT, ValArg);
+  return DAG.getBitcast(ValVT, Narrowed);
+}
+
+/// Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue X86TargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl &InVals,
+    uint32_t *RegMask) const {
+
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  // Assign locations to each value returned by this call.
+  // NOTE(review): the template arguments of this SmallVector appear to be
+  // missing from this listing - confirm against the upstream source.
+  SmallVector RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+  // Copy all of the result registers out of their specified physreg.
+  // InsIndex is advanced in lockstep with I but is not read in the loop body.
+  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
+       ++I, ++InsIndex) {
+    CCValAssign &VA = RVLocs[I];
+    EVT CopyVT = VA.getLocVT();
+
+    // In some calling conventions we need to remove the used registers
+    // from the register mask. The mask holds one bit per register, packed 32
+    // to a word; the bit of the location register and of each of its
+    // sub-registers is cleared.
+    if (RegMask) {
+      for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
+        RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
+    }
+
+    // Report an error if there was an attempt to return FP values via XMM
+    // registers. The location is then redirected to FP1 (for XMM1) or FP0
+    // (for anything else) so that lowering can continue.
+    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
+      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+      if (VA.getLocReg() == X86::XMM1)
+        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
+      else
+        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+    } else if (!Subtarget.hasSSE2() &&
+               X86::FR64XRegClass.contains(VA.getLocReg()) &&
+               CopyVT == MVT::f64) {
+      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
+      if (VA.getLocReg() == X86::XMM1)
+        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
+      else
+        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+    }
+
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    bool RoundAfterCopy = false;
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT())) {
+      if (!Subtarget.hasX87())
+        report_fatal_error("X87 register return with X87 disabled");
+      CopyVT = MVT::f80;
+      RoundAfterCopy = (CopyVT != VA.getLocVT());
+    }
+
+    SDValue Val;
+    if (VA.needsCustom()) {
+      assert(VA.getValVT() == MVT::v64i1 &&
+             "Currently the only custom case is when we split v64i1 to 2 regs");
+      // The custom case reads two locations, so the next RVLocs entry is
+      // consumed here as well (++I).
+      Val =
+          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
+    } else {
+      // Of the glued copy's results, #1 becomes the new chain, #0 is the
+      // copied value and #2 is the glue for the next copy.
+      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
+                  .getValue(1);
+      Val = Chain.getValue(0);
+      InGlue = Chain.getValue(2);
+    }
+
+    if (RoundAfterCopy)
+      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                        // This truncation won't change the value.
+                        DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
+
+    if (VA.isExtInLoc()) {
+      if (VA.getValVT().isVector() &&
+          VA.getValVT().getScalarType() == MVT::i1 &&
+          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
+           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
+        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
+        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
+      } else
+        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+    }
+
+    if (VA.getLocInfo() == CCValAssign::BCvt)
+      Val = DAG.getBitcast(VA.getValVT(), Val);
+
+    InVals.push_back(Val);
+  }
+
+  // The chain threaded through the register copies above.
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//                C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//  StdCall calling convention seems to be standard for many Windows' API
+//  routines and around. It differs from C calling convention just a little:
+//  callee should clean up the stack, not caller. Symbols should be also
+//  decorated in some fancy way :) It doesn't support any vector arguments.
+//  For info on fast calling convention see Fast Calling Convention (tail call)
+//  implementation LowerX86_32FastCCCallTo.
+
+/// Determines whether Args, either a set of outgoing arguments to a call, or a
+/// set of incoming args of a call, contains an sret pointer that the callee
+/// pops.
+///
+/// Only the first argument is inspected.
+/// \returns true only on a 32-bit subtarget whose first argument is marked
+/// sret and not inreg, and which is neither an MSVCRT nor an MCU target.
+// NOTE(review): the template parameter list and the type arguments of
+// SmallVectorImpl / std::is_same_v appear to be missing from this listing -
+// confirm against the upstream source.
+template 
+static bool hasCalleePopSRet(const SmallVectorImpl &Args,
+                             const X86Subtarget &Subtarget) {
+  // Not C++20 (yet), so no concepts available.
+  static_assert(std::is_same_v ||
+                    std::is_same_v,
+                "requires ISD::OutputArg or ISD::InputArg");
+
+  // Only 32-bit pops the sret.  It's a 64-bit world these days, so early-out
+  // for most compilations.
+  if (!Subtarget.is32Bit())
+    return false;
+
+  // Without arguments there cannot be an sret argument.
+  if (Args.empty())
+    return false;
+
+  // Most calls do not have an sret argument, check the arg next.
+  const ISD::ArgFlagsTy &Flags = Args[0].Flags;
+  if (!Flags.isSRet() || Flags.isInReg())
+    return false;
+
+  // The MSVCabi does not pop the sret.
+  if (Subtarget.getTargetTriple().isOSMSVCRT())
+    return false;
+
+  // MCUs don't pop the sret
+  if (Subtarget.isTargetMCU())
+    return false;
+
+  // Callee pops argument
+  return true;
+}
+
+/// Make a copy of an aggregate at address specified by "Src" to address
+/// "Dst" with size and alignment information specified by the specific
+/// parameter attribute. The copy will be passed as a byval function parameter.
+///
+/// The size is Flags.getByValSize() and the alignment is
+/// Flags.getNonZeroByValAlign(). The memcpy is requested as non-volatile,
+/// always-inline and not a tail call, with default pointer info on both sides.
+/// \returns the node produced by SelectionDAG::getMemcpy.
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+                                         SDValue Chain, ISD::ArgFlagsTy Flags,
+                                         SelectionDAG &DAG, const SDLoc &dl) {
+  SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
+
+  return DAG.getMemcpy(
+      Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
+      /*isVolatile*/ false, /*AlwaysInline=*/true,
+      /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
+}
+
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+  // Exactly these conventions qualify; every other one does not.
+  switch (CC) {
+  case CallingConv::Fast:
+  case CallingConv::GHC:
+  case CallingConv::X86_RegCall:
+  case CallingConv::HiPE:
+  case CallingConv::Tail:
+  case CallingConv::SwiftTail:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Tell whether tail call optimization is ever attempted for calls using
+/// calling convention \p CC.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+  // C calling conventions.
+  const bool IsCFamily = CC == CallingConv::C || CC == CallingConv::Win64 ||
+                         CC == CallingConv::X86_64_SysV;
+  // Callee pop conventions.
+  const bool IsCalleePop =
+      CC == CallingConv::X86_ThisCall || CC == CallingConv::X86_StdCall ||
+      CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall;
+
+  if (IsCFamily || IsCalleePop || CC == CallingConv::Swift)
+    return true;
+
+  // Everything else qualifies only if TCO can be guaranteed for it.
+  return canGuaranteeTCO(CC);
+}
+
+/// Tell whether the function is turned into a tail call target by changing
+/// its ABI: always for the Tail and SwiftTail conventions, and otherwise only
+/// when \p GuaranteedTailCallOpt is set and \p CC can guarantee TCO.
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
+  if (CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
+    return true;
+  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
+}
+
+/// A call may be emitted as a tail call only if it is marked as one and its
+/// calling convention is one for which tail calls are ever attempted.
+bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
+  return CI->isTailCall() && mayTailCallThisCC(CI->getCallingConv());
+}
+
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+                                    const SmallVectorImpl &Ins,
+                                    const SDLoc &dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    MachineFrameInfo &MFI, unsigned i) const {
+  // Create the nodes corresponding to a load from this parameter slot.
+  ISD::ArgFlagsTy Flags = Ins[i].Flags;
+  bool AlwaysUseMutable = shouldGuaranteeTCO(
+      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
+  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+  EVT ValVT;
+  MVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // If value is passed by pointer we have address passed instead of the value
+  // itself.
+  // No need to extend if the mask value and location share the same
+  // absolute size.
+  bool ExtendedInMem =
+      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
+      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
+
+  // ValVT is the type loaded from the stack slot: the location type for
+  // indirect or in-memory-extended values, the value type otherwise.
+  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
+    ValVT = VA.getLocVT();
+  else
+    ValVT = VA.getValVT();
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can be
+  // changed with more analysis.
+  // In case of tail call optimization mark all arguments mutable. Since they
+  // could be overwritten by lowering of arguments in case of a tail call.
+  if (Flags.isByVal()) {
+    unsigned Bytes = Flags.getByValSize();
+    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
+
+    // FIXME: For now, all byval parameter objects are marked as aliasing. This
+    // can be improved with deeper analysis.
+    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
+                                   /*isAliased=*/true);
+    // A byval argument yields the address of its stack object, not a load.
+    return DAG.getFrameIndex(FI, PtrVT);
+  }
+
+  EVT ArgVT = Ins[i].ArgVT;
+
+  // If this is a vector that has been split into multiple parts, and the
+  // scalar size of the parts don't match the vector element size, then we can't
+  // elide the copy. The parts will have padding between them instead of being
+  // packed like a vector.
+  bool ScalarizedAndExtendedVector =
+      ArgVT.isVector() && !VA.getLocVT().isVector() &&
+      VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
+
+  // This is an argument in memory. We might be able to perform copy elision.
+  // If the argument is passed directly in memory without any extension, then we
+  // can perform copy elision. Large vector types, for example, may be passed
+  // indirectly by pointer.
+  if (Flags.isCopyElisionCandidate() &&
+      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
+      !ScalarizedAndExtendedVector) {
+    SDValue PartAddr;
+    if (Ins[i].PartOffset == 0) {
+      // If this is a one-part value or the first part of a multi-part value,
+      // create a stack object for the entire argument value type and return a
+      // load from our portion of it. This assumes that if the first part of an
+      // argument is in memory, the rest will also be in memory.
+      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
+                                     /*IsImmutable=*/false);
+      PartAddr = DAG.getFrameIndex(FI, PtrVT);
+      return DAG.getLoad(
+          ValVT, dl, Chain, PartAddr,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+    } else {
+      // This is not the first piece of an argument in memory. See if there is
+      // already a fixed stack object including this offset. If so, assume it
+      // was created by the PartOffset == 0 branch above and create a load from
+      // the appropriate offset into it.
+      int64_t PartBegin = VA.getLocMemOffset();
+      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
+      int FI = MFI.getObjectIndexBegin();
+      // Scan the fixed objects for one whose byte range fully contains
+      // [PartBegin, PartEnd).
+      for (; MFI.isFixedObjectIndex(FI); ++FI) {
+        int64_t ObjBegin = MFI.getObjectOffset(FI);
+        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
+        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
+          break;
+      }
+      // FI is still a fixed-object index only if the scan stopped on a match.
+      if (MFI.isFixedObjectIndex(FI)) {
+        SDValue Addr =
+            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
+                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
+        return DAG.getLoad(
+            ValVT, dl, Chain, Addr,
+            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
+                                              Ins[i].PartOffset));
+      }
+      // No enclosing object was found: fall through and create a separate
+      // fixed object for this part below.
+    }
+  }
+
+  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
+                                 VA.getLocMemOffset(), isImmutable);
+
+  // Set SExt or ZExt flag.
+  if (VA.getLocInfo() == CCValAssign::ZExt) {
+    MFI.setObjectZExt(FI, true);
+  } else if (VA.getLocInfo() == CCValAssign::SExt) {
+    MFI.setObjectSExt(FI, true);
+  }
+
+  // An explicit 4-byte alignment is only supplied for 32-bit Windows MSVC
+  // targets, and not for f80 values; otherwise Alignment stays unset.
+  MaybeAlign Alignment;
+  if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
+      ValVT != MVT::f80)
+    Alignment = MaybeAlign(4);
+  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+  SDValue Val = DAG.getLoad(
+      ValVT, dl, Chain, FIN,
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+      Alignment);
+  // A value that was extended in memory was loaded with its location type;
+  // convert it back to the value type (SCALAR_TO_VECTOR for vectors, TRUNCATE
+  // otherwise).
+  return ExtendedInMem
+             ? (VA.getValVT().isVector()
+                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
+                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
+             : Val;
+}
+
+/// Return the 64-bit GPRs listed as integer argument registers for
+/// \p CallConv: RCX, RDX, R8, R9 for Win64 conventions, otherwise RDI, RSI,
+/// RDX, RCX, R8, R9. Requires a 64-bit subtarget.
+// NOTE(review): the element type of the returned ArrayRef appears to be
+// missing from this listing - confirm against the upstream source.
+// FIXME: Get this from tablegen.
+static ArrayRef get64BitArgumentGPRs(CallingConv::ID CallConv,
+                                                const X86Subtarget &Subtarget) {
+  assert(Subtarget.is64Bit());
+
+  if (Subtarget.isCallingConvWin64(CallConv)) {
+    static const MCPhysReg GPR64ArgRegsWin64[] = {
+      X86::RCX, X86::RDX, X86::R8,  X86::R9
+    };
+    return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
+  }
+
+  static const MCPhysReg GPR64ArgRegs64Bit[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+  };
+  return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
+}
+
+/// Return the XMM registers listed as argument registers for \p CallConv:
+/// XMM0 through XMM7, or std::nullopt for Win64 conventions, when soft-float
+/// is in use, or when SSE1 is unavailable. Requires a 64-bit subtarget.
+/// \p MF is not referenced by the body.
+// FIXME: Get this from tablegen.
+static ArrayRef get64BitArgumentXMMs(MachineFunction &MF,
+                                                CallingConv::ID CallConv,
+                                                const X86Subtarget &Subtarget) {
+  assert(Subtarget.is64Bit());
+  if (Subtarget.isCallingConvWin64(CallConv)) {
+    // The XMM registers which might contain var arg parameters are shadowed
+    // in their paired GPR.  So we only need to save the GPR to their home
+    // slots.
+    // TODO: __vectorcall will change this.
+    return std::nullopt;
+  }
+
+  bool isSoftFloat = Subtarget.useSoftFloat();
+  if (isSoftFloat || !Subtarget.hasSSE1())
+    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
+    // registers.
+    return std::nullopt;
+
+  static const MCPhysReg XMMArgRegs64Bit[] = {
+    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+  };
+  return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
+}
+
+#ifndef NDEBUG
+/// Assertion helper (only compiled when NDEBUG is not defined): true if
+/// \p ArgLocs is ordered by non-decreasing value number.
+static bool isSortedByValueNo(ArrayRef ArgLocs) {
+  return llvm::is_sorted(
+      ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
+        return A.getValNo() < B.getValNo();
+      });
+}
+#endif
+
+namespace {
+/// This is a helper class for lowering variable arguments parameters.
+///
+/// It bundles references to the per-function objects used by the lowering
+/// (DAG, subtarget, machine function, frame info, frame lowering, target
+/// lowering, calling-convention state) so the member functions only need the
+/// chain and the stack size.
+class VarArgsLoweringHelper {
+public:
+  /// Store the given objects and look up the remaining ones from \p DAG and
+  /// \p Subtarget. \p Loc and \p CCInfo are held by reference.
+  VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
+                        SelectionDAG &DAG, const X86Subtarget &Subtarget,
+                        CallingConv::ID CallConv, CCState &CCInfo)
+      : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
+        TheMachineFunction(DAG.getMachineFunction()),
+        TheFunction(TheMachineFunction.getFunction()),
+        FrameInfo(TheMachineFunction.getFrameInfo()),
+        FrameLowering(*Subtarget.getFrameLowering()),
+        TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
+        CCInfo(CCInfo) {}
+
+  // Lower variable arguments parameters.
+  void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
+
+private:
+  // Create the vararg frame objects and spill the unallocated argument
+  // registers; \p Chain is updated when stores are emitted.
+  void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
+
+  // Copy the registers forwarded to musttail calls into virtual registers,
+  // threading the copies through \p Chain.
+  void forwardMustTailParameters(SDValue &Chain);
+
+  bool is64Bit() const { return Subtarget.is64Bit(); }
+  bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
+
+  X86MachineFunctionInfo *FuncInfo;
+  const SDLoc &DL;
+  SelectionDAG &DAG;
+  const X86Subtarget &Subtarget;
+  MachineFunction &TheMachineFunction;
+  const Function &TheFunction;
+  MachineFrameInfo &FrameInfo;
+  const TargetFrameLowering &FrameLowering;
+  const TargetLowering &TargLowering;
+  CallingConv::ID CallConv;
+  CCState &CCInfo;
+};
+} // namespace
+
+void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
+    SDValue &Chain, unsigned StackSize) {
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start. We
+  // can skip this if there are no va_start calls.
+  if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
+                    CallConv != CallingConv::X86_ThisCall)) {
+    FuncInfo->setVarArgsFrameIndex(
+        FrameInfo.CreateFixedObject(1, StackSize, true));
+  }
+
+  // 64-bit calling conventions support varargs and register parameters, so we
+  // have to do extra work to spill them in the prologue.
+  if (is64Bit()) {
+    // Find the first unallocated argument registers.
+    ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+    ArrayRef ArgXMMs =
+        get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
+    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
+
+    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
+           "SSE register cannot be used when SSE is disabled!");
+
+    if (isWin64()) {
+      // Get to the caller-allocated home save location.  Add 8 to account
+      // for the return address.
+      int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
+      // The save area starts at the home slot of the first unallocated GPR
+      // (8 bytes per register).
+      FuncInfo->setRegSaveFrameIndex(
+          FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+      // Fixup to set vararg frame on shadow area (4 x i64).
+      if (NumIntRegs < 4)
+        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+    } else {
+      // For X86-64, if there are vararg parameters that are passed via
+      // registers, then we must store them to their spots on the stack so
+      // they may be loaded by dereferencing the result of va_next.
+      // GPR slots are 8 bytes each; the XMM slots are 16 bytes each and
+      // follow the slots of all GPR argument registers.
+      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+      FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
+          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
+    }
+
+    // NOTE(review): the template arguments of the SmallVectors below appear
+    // to be missing from this listing - confirm against the upstream source.
+    SmallVector
+        LiveGPRs; // list of SDValue for GPR registers keeping live input value
+    SmallVector LiveXMMRegs; // list of SDValue for XMM registers
+                                         // keeping live input value
+    SDValue ALVal; // if applicable keeps SDValue for %al register
+
+    // Gather all the live in physical registers.
+    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+      Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
+      LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
+    }
+    const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
+    if (!AvailableXmms.empty()) {
+      // %al is read only when there are XMM registers left to save.
+      Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
+      ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
+      for (MCPhysReg Reg : AvailableXmms) {
+        // FastRegisterAllocator spills virtual registers at basic
+        // block boundary. That leads to usages of xmm registers
+        // outside of check for %al. Pass physical registers to
+        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
+        TheMachineFunction.getRegInfo().addLiveIn(Reg);
+        LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
+      }
+    }
+
+    // Store the integer parameter registers.
+    SmallVector MemOps;
+    SDValue RSFIN =
+        DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+                          TargLowering.getPointerTy(DAG.getDataLayout()));
+    unsigned Offset = FuncInfo->getVarArgsGPOffset();
+    for (SDValue Val : LiveGPRs) {
+      SDValue FIN = DAG.getNode(ISD::ADD, DL,
+                                TargLowering.getPointerTy(DAG.getDataLayout()),
+                                RSFIN, DAG.getIntPtrConstant(Offset, DL));
+      // Each store is chained on its own register copy (result #1 of Val);
+      // the stores are joined by the TokenFactor at the end.
+      SDValue Store =
+          DAG.getStore(Val.getValue(1), DL, Val, FIN,
+                       MachinePointerInfo::getFixedStack(
+                           DAG.getMachineFunction(),
+                           FuncInfo->getRegSaveFrameIndex(), Offset));
+      MemOps.push_back(Store);
+      Offset += 8;
+    }
+
+    // Now store the XMM (fp + vector) parameter registers.
+    if (!LiveXMMRegs.empty()) {
+      // Operands: chain, %al value, save-area address, FP offset, followed
+      // by the XMM registers themselves.
+      SmallVector SaveXMMOps;
+      SaveXMMOps.push_back(Chain);
+      SaveXMMOps.push_back(ALVal);
+      SaveXMMOps.push_back(RSFIN);
+      SaveXMMOps.push_back(
+          DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
+      llvm::append_range(SaveXMMOps, LiveXMMRegs);
+      // NOTE(review): the size 128 presumably corresponds to 8 XMM registers
+      // of 16 bytes each - confirm.
+      MachineMemOperand *StoreMMO =
+          DAG.getMachineFunction().getMachineMemOperand(
+              MachinePointerInfo::getFixedStack(
+                  DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
+                  Offset),
+              MachineMemOperand::MOStore, 128, Align(16));
+      MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
+                                               DL, DAG.getVTList(MVT::Other),
+                                               SaveXMMOps, MVT::i8, StoreMMO));
+    }
+
+    // Merge all emitted stores into a single chain.
+    if (!MemOps.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+  }
+}
+
+void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
+  // Find the largest legal vector type.
+  MVT VecVT = MVT::Other;
+  // FIXME: Only some x86_32 calling conventions support AVX512.
+  if (Subtarget.useAVX512Regs() &&
+      (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
+                     CallConv == CallingConv::Intel_OCL_BI)))
+    VecVT = MVT::v16f32;
+  else if (Subtarget.hasAVX())
+    VecVT = MVT::v8f32;
+  else if (Subtarget.hasSSE2())
+    VecVT = MVT::v4f32;
+
+  // We forward some GPRs and some vector types.
+  // NOTE(review): the template arguments of SmallVector / SmallVectorImpl
+  // below appear to be missing from this listing - confirm against the
+  // upstream source.
+  // The pointer-sized integer type is always forwarded; a vector type is
+  // added only if one was selected above.
+  SmallVector RegParmTypes;
+  MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
+  RegParmTypes.push_back(IntVT);
+  if (VecVT != MVT::Other)
+    RegParmTypes.push_back(VecVT);
+
+  // Compute the set of forwarded registers. The rest are scratch.
+  SmallVectorImpl &Forwards =
+      FuncInfo->getForwardedMustTailRegParms();
+  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+  // Forward AL for SysV x86_64 targets, since it is used for varargs.
+  if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
+    Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
+    Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+  }
+
+  // Copy all forwards from physical to virtual registers.
+  for (ForwardedRegister &FR : Forwards) {
+    // FIXME: Can we use a less constrained schedule?
+    SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
+    // FR.VReg is replaced by a fresh virtual register that receives the
+    // copied value; the copy is chained so Chain reflects every forward.
+    FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
+        TargLowering.getRegClassFor(FR.VT));
+    Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
+  }
+}
+
+/// Entry point of the helper: mark both frame indices as unset, then create
+/// the vararg area if the function has a va_start and forward the register
+/// parameters if it contains a musttail call in a vararg function.
+/// \p Chain is updated by the steps that run.
+void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
+                                                   unsigned StackSize) {
+  // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
+  // If necessary, it would be set into the correct value later.
+  FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+  FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+
+  if (FrameInfo.hasVAStart())
+    createVarArgAreaAndStoreRegisters(Chain, StackSize);
+
+  if (FrameInfo.hasMustTailInVarArgFunc())
+    forwardMustTailParameters(Chain);
+}
+
+SDValue X86TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo();
+
+  const Function &F = MF.getFunction();
+  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
+      F.getName() == "main")
+    FuncInfo->setForceFramePointer(true);
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  bool Is64Bit = Subtarget.is64Bit();
+  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
+
+  assert(
+      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
+      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+  // Allocate shadow area for Win64.
+  if (IsWin64)
+    CCInfo.AllocateStack(32, Align(8));
+
+  CCInfo.AnalyzeArguments(Ins, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
+  }
+
+  // The next loop assumes that the locations are in the same order of the
+  // input arguments.
+ assert(isSortedByValueNo(ArgLocs) && + "Argument Location list must be sorted before lowering"); + + SDValue ArgValue; + for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; + ++I, ++InsIndex) { + assert(InsIndex < Ins.size() && "Invalid Ins index"); + CCValAssign &VA = ArgLocs[I]; + + if (VA.isRegLoc()) { + EVT RegVT = VA.getLocVT(); + if (VA.needsCustom()) { + assert( + VA.getValVT() == MVT::v64i1 && + "Currently the only custom case is when we split v64i1 to 2 regs"); + + // v64i1 values, in regcall calling convention, that are + // compiled to 32 bit arch, are split up into two registers. + ArgValue = + getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); + } else { + const TargetRegisterClass *RC; + if (RegVT == MVT::i8) + RC = &X86::GR8RegClass; + else if (RegVT == MVT::i16) + RC = &X86::GR16RegClass; + else if (RegVT == MVT::i32) + RC = &X86::GR32RegClass; + else if (Is64Bit && RegVT == MVT::i64) + RC = &X86::GR64RegClass; + else if (RegVT == MVT::f16) + RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; + else if (RegVT == MVT::f32) + RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; + else if (RegVT == MVT::f64) + RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; + else if (RegVT == MVT::f80) + RC = &X86::RFP80RegClass; + else if (RegVT == MVT::f128) + RC = &X86::VR128RegClass; + else if (RegVT.is512BitVector()) + RC = &X86::VR512RegClass; + else if (RegVT.is256BitVector()) + RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; + else if (RegVT.is128BitVector()) + RC = Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass; + else if (RegVT == MVT::x86mmx) + RC = &X86::VR64RegClass; + else if (RegVT == MVT::v1i1) + RC = &X86::VK1RegClass; + else if (RegVT == MVT::v8i1) + RC = &X86::VK8RegClass; + else if (RegVT == MVT::v16i1) + RC = &X86::VK16RegClass; + else if (RegVT == MVT::v32i1) + RC = &X86::VK32RegClass; + else if (RegVT == MVT::v64i1) + RC = &X86::VK64RegClass; + else + llvm_unreachable("Unknown argument type!"); + + Register Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + } + + // If this is an 8 or 16-bit value, it is really passed promoted to 32 + // bits. Insert an assert[sz]ext to capture this, then truncate to the + // right size. + if (VA.getLocInfo() == CCValAssign::SExt) + ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::BCvt) + ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); + + if (VA.isExtInLoc()) { + // Handle MMX values passed in XMM regs. + if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) + ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); + else if (VA.getValVT().isVector() && + VA.getValVT().getScalarType() == MVT::i1 && + ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || + (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { + // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 + ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); + } else + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + } + } else { + assert(VA.isMemLoc()); + ArgValue = + LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); + } + + // If value is passed via pointer - do a load. 
+ if (VA.getLocInfo() == CCValAssign::Indirect && + !(Ins[I].Flags.isByVal() && VA.isRegLoc())) { + ArgValue = + DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); + } + + InVals.push_back(ArgValue); + } + + for (unsigned I = 0, E = Ins.size(); I != E; ++I) { + if (Ins[I].Flags.isSwiftAsync()) { + auto X86FI = MF.getInfo(); + if (Subtarget.is64Bit()) + X86FI->setHasSwiftAsyncContext(true); + else { + int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false); + X86FI->setSwiftAsyncContextFrameIdx(FI); + SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I], + DAG.getFrameIndex(FI, MVT::i32), + MachinePointerInfo::getFixedStack(MF, FI)); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain); + } + } + + // Swift calling convention does not require we copy the sret argument + // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. + if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail) + continue; + + // All x86 ABIs require that for returning structs by value we copy the + // sret argument into %rax/%eax (depending on ABI) for the return. Save + // the argument into a virtual register so that we can access it from the + // return points. + if (Ins[I].Flags.isSRet()) { + assert(!FuncInfo->getSRetReturnReg() && + "SRet return has already been set"); + MVT PtrTy = getPointerTy(DAG.getDataLayout()); + Register Reg = + MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + FuncInfo->setSRetReturnReg(Reg); + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); + break; + } + } + + unsigned StackSize = CCInfo.getStackSize(); + // Align stack specially for tail calls. 
+ if (shouldGuaranteeTCO(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) + StackSize = GetAlignedArgumentStackSize(StackSize, DAG); + + if (IsVarArg) + VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo) + .lowerVarArgsParameters(Chain, StackSize); + + // Some CCs need callee pop. + if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt)) { + FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. + } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { + // X86 interrupts must pop the error code (and the alignment padding) if + // present. + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); + } else { + FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. + // If this is an sret function, the return should pop the hidden pointer. + if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget)) + FuncInfo->setBytesToPopOnReturn(4); + } + + if (!Is64Bit) { + // RegSaveFrameIndex is X86-64 only. + FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + } + + FuncInfo->setArgumentStackSize(StackSize); + + if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { + EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + assert(Is64Bit); + // TODO: Add a mechanism to frame lowering that will allow us to indicate + // that we'd prefer this slot be allocated towards the bottom of the frame + // (i.e. near the stack pointer after allocating the frame). Every + // funclet needs a copy of this slot in its (mostly empty) frame, and the + // offset from the bottom of this and each funclet's frame must be the + // same, so the size of funclets' (mostly empty) frames is dictated by + // how far this slot is from the bottom (since they allocate just enough + // space to accommodate holding this slot at the correct offset). 
+ int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false); + EHInfo->PSPSymFrameIdx = PSPSymFI; + } + } + + if (shouldDisableArgRegFromCSR(CallConv) || + F.hasFnAttribute("no_caller_saved_registers")) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (std::pair Pair : MRI.liveins()) + MRI.disableCalleeSavedRegister(Pair.first); + } + + return Chain; +} + +SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, + SDValue Arg, const SDLoc &dl, + SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags, + bool isByVal) const { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); + if (isByVal) + return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); + + MaybeAlign Alignment; + if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && + Arg.getSimpleValueType() != MVT::f80) + Alignment = MaybeAlign(4); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + Alignment); +} + +/// Emit a load of return address if tail call +/// optimization is performed and it is required. +SDValue X86TargetLowering::EmitTailCallLoadRetAddr( + SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, + bool Is64Bit, int FPDiff, const SDLoc &dl) const { + // Adjust the Return address stack slot. + EVT VT = getPointerTy(DAG.getDataLayout()); + OutRetAddr = getReturnAddressFrameIndex(DAG); + + // Load the "old" Return address. + OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); + return SDValue(OutRetAddr.getNode(), 1); +} + +/// Emit a store of the return address if tail call +/// optimization is performed and it is required (FPDiff!=0). 
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+                                        SDValue Chain, SDValue RetAddrFrIdx,
+                                        EVT PtrVT, unsigned SlotSize,
+                                        int FPDiff, const SDLoc &dl) {
+  // Store the return address to the appropriate stack slot.
+  // With FPDiff == 0 nothing is emitted and the incoming chain is returned.
+  if (!FPDiff) return Chain;
+  // Calculate the new stack slot for the return address: a fixed object of
+  // SlotSize bytes at offset FPDiff - SlotSize.
+  int NewReturnAddrFI =
+    MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
+                                        false);
+  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
+  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
+                       MachinePointerInfo::getFixedStack(
+                           DAG.getMachineFunction(), NewReturnAddrFI));
+  return Chain;
+}
+
+/// Returns a vector shuffle node for a movs{s|d}, movd
+/// operation of specified width. The shuffle mask built here is
+/// {NumElems, 1, 2, ..., NumElems-1}.
+SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+                                   SDValue V1, SDValue V2) const {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector Mask;
+  Mask.push_back(NumElems);
+  for (unsigned i = 1; i != NumElems; ++i)
+    Mask.push_back(i);
+  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
+}
+
+/// Lower the outgoing call described by \p CLI into DAG nodes: decide whether
+/// it is emitted as a tail call / sibcall, assign argument locations with
+/// CC_X86, copy or store each argument, and emit the call node
+/// (X86ISD::CALL, NT_CALL, CALL_RVMARKER, or TC_RETURN for tail calls).
+/// Tail calls return the TC_RETURN node directly; all other calls finish by
+/// passing \p InVals to LowerCallResult and returning its result.
+/// CLI.IsTailCall is bound by reference and may be cleared here.
+/// NOTE(review): template argument lists (e.g. on SmallVector, getInfo,
+/// dyn_cast, isa, cast) appear to have been stripped from this copy of the
+/// patch; confirm against the upstream file.
+SDValue
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                             SmallVectorImpl &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc &dl = CLI.DL;
+  SmallVectorImpl &Outs = CLI.Outs;
+  SmallVectorImpl &OutVals = CLI.OutVals;
+  SmallVectorImpl &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool &isTailCall = CLI.IsTailCall;
+  bool isVarArg = CLI.IsVarArg;
+  const auto *CB = CLI.CB;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool Is64Bit = Subtarget.is64Bit();
+  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
+  bool IsSibcall = false;
+  bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
+      CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
+  bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
+  X86MachineFunctionInfo *X86Info = MF.getInfo();
+  bool HasNCSR = (CB && isa(CB) &&
+                  CB->hasFnAttr("no_caller_saved_registers"));
+  bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
+  bool IsIndirectCall = (CB && isa(CB) && CB->isIndirectCall());
+  bool IsCFICall = IsIndirectCall && CLI.CFIType;
+  const Module *M = MF.getMMI().getModule();
+  Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
+
+  MachineFunction::CallSiteInfo CSInfo;
+  if (CallConv == CallingConv::X86_INTR)
+    report_fatal_error("X86 interrupts may not be called directly");
+
+  bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
+  if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
+    // If we are using a GOT, disable tail calls to external symbols with
+    // default visibility. Tail calling such a symbol requires using a GOT
+    // relocation, which forces early binding of the symbol. This breaks code
+    // that requires lazy function symbol resolution. Using musttail or
+    // GuaranteedTailCallOpt will override this.
+    GlobalAddressSDNode *G = dyn_cast(Callee);
+    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
+               G->getGlobal()->hasDefaultVisibility()))
+      isTailCall = false;
+  }
+
+  if (isTailCall && !IsMustTail) {
+    // Check if it's really possible to do a tail call.
+    isTailCall = IsEligibleForTailCallOptimization(
+        Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
+        Ins, DAG);
+
+    // Sibcalls are automatically detected tailcalls which do not require
+    // ABI changes.
+    if (!IsGuaranteeTCO && isTailCall)
+      IsSibcall = true;
+
+    if (isTailCall)
+      ++NumTailCalls;
+  }
+
+  if (IsMustTail && !isTailCall)
+    report_fatal_error("failed to perform tail call elimination on a call "
+                       "site marked musttail");
+
+  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
+         "Var args not supported with calling convention fastcc, ghc or hipe");
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+  // Allocate shadow area for Win64.
+  if (IsWin64)
+    CCInfo.AllocateStack(32, Align(8));
+
+  CCInfo.AnalyzeArguments(Outs, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
+  }
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+  if (IsSibcall)
+    // This is a sibcall. The memory operands are available in caller's
+    // own caller's stack.
+    NumBytes = 0;
+  else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
+    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+  int FPDiff = 0;
+  if (isTailCall &&
+      shouldGuaranteeTCO(CallConv,
+                         MF.getTarget().Options.GuaranteedTailCallOpt)) {
+    // Lower arguments at fp - stackoffset + fpdiff.
+    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
+    FPDiff = NumBytesCallerPushed - NumBytes;
+
+    // Set the delta of movement of the returnaddr stackslot.
+    // But only update it when FPDiff is smaller than the recorded delta.
+    if (FPDiff < X86Info->getTCReturnAddrDelta())
+      X86Info->setTCReturnAddrDelta(FPDiff);
+  }
+
+  unsigned NumBytesToPush = NumBytes;
+  unsigned NumBytesToPop = NumBytes;
+
+  // If we have an inalloca argument, all stack space has already been allocated
+  // for us and is right at the top of the stack. We don't support multiple
+  // arguments passed in memory when using inalloca.
+  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
+    NumBytesToPush = 0;
+    if (!ArgLocs.back().isMemLoc())
+      report_fatal_error("cannot use inalloca attribute on a register "
+                         "parameter");
+    if (ArgLocs.back().getLocMemOffset() != 0)
+      report_fatal_error("any parameter with the inalloca attribute must be "
+                         "the only memory argument");
+  } else if (CLI.IsPreallocated) {
+    assert(ArgLocs.back().isMemLoc() &&
+           "cannot use preallocated attribute on a register "
+           "parameter");
+    // Record the stack offset of every preallocated argument, plus the total
+    // size, under this call site's preallocated id.
+    SmallVector PreallocatedOffsets;
+    for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
+      if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
+        PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
+      }
+    }
+    auto *MFI = DAG.getMachineFunction().getInfo();
+    size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
+    MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
+    MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
+    NumBytesToPush = 0;
+  }
+
+  if (!IsSibcall && !IsMustTail)
+    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
+                                 NumBytes - NumBytesToPush, dl);
+
+  SDValue RetAddrFrIdx;
+  // Load return address for tail calls.
+  if (isTailCall && FPDiff)
+    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+                                    Is64Bit, FPDiff, dl);
+
+  SmallVector, 8> RegsToPass;
+  SmallVector MemOpChains;
+  SDValue StackPtr;
+
+  // The next loop assumes that the locations are in the same order of the
+  // input arguments.
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
+
+  // Walk the register/memloc assignments, inserting copies/loads. In the case
+  // of tail call optimization, arguments are handled later.
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
+       ++I, ++OutIndex) {
+    assert(OutIndex < Outs.size() && "Invalid Out index");
+    // Skip inalloca/preallocated arguments, they have already been written.
+    ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
+    if (Flags.isInAlloca() || Flags.isPreallocated())
+      continue;
+
+    CCValAssign &VA = ArgLocs[I];
+    EVT RegVT = VA.getLocVT();
+    SDValue Arg = OutVals[OutIndex];
+    bool isByVal = Flags.isByVal();
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::AExt:
+      if (Arg.getValueType().isVector() &&
+          Arg.getValueType().getVectorElementType() == MVT::i1)
+        Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
+      else if (RegVT.is128BitVector()) {
+        // Special case: passing MMX values in XMM registers.
+        Arg = DAG.getBitcast(MVT::i64, Arg);
+        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+      } else
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getBitcast(RegVT, Arg);
+      break;
+    case CCValAssign::Indirect: {
+      if (isByVal) {
+        // Memcpy the argument to a temporary stack slot to prevent
+        // the caller from seeing any modifications the callee may make
+        // as guaranteed by the `byval` attribute.
+        int FrameIdx = MF.getFrameInfo().CreateStackObject(
+            Flags.getByValSize(),
+            std::max(Align(16), Flags.getNonZeroByValAlign()), false);
+        SDValue StackSlot =
+            DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
+        Chain =
+            CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
+        // From now on treat this as a regular pointer
+        Arg = StackSlot;
+        isByVal = false;
+      } else {
+        // Store the argument to a stack temporary and pass its address.
+        SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+        int FI = cast(SpillSlot)->getIndex();
+        Chain = DAG.getStore(
+            Chain, dl, Arg, SpillSlot,
+            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+        Arg = SpillSlot;
+      }
+      break;
+    }
+    }
+
+    if (VA.needsCustom()) {
+      assert(VA.getValVT() == MVT::v64i1 &&
+             "Currently the only custom case is when we split v64i1 to 2 regs");
+      // Split v64i1 value into two registers; this consumes ArgLocs[I + 1] too.
+      Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
+    } else if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      const TargetOptions &Options = DAG.getTarget().Options;
+      if (Options.EmitCallSiteInfo)
+        CSInfo.emplace_back(VA.getLocReg(), I);
+      if (isVarArg && IsWin64) {
+        // Win64 ABI requires argument XMM reg to be copied to the corresponding
+        // shadow reg if callee is a varargs function.
+        Register ShadowReg;
+        switch (VA.getLocReg()) {
+        case X86::XMM0: ShadowReg = X86::RCX; break;
+        case X86::XMM1: ShadowReg = X86::RDX; break;
+        case X86::XMM2: ShadowReg = X86::R8; break;
+        case X86::XMM3: ShadowReg = X86::R9; break;
+        }
+        if (ShadowReg)
+          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
+      }
+    } else if (!IsSibcall && (!isTailCall || isByVal)) {
+      assert(VA.isMemLoc());
+      if (!StackPtr.getNode())
+        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                                      getPointerTy(DAG.getDataLayout()));
+      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+                                             dl, DAG, VA, Flags, isByVal));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  if (Subtarget.isPICStyleGOT()) {
+    // ELF / PIC requires GOT in the EBX register before function calls via PLT
+    // GOT pointer (except regcall).
+    if (!isTailCall) {
+      // Indirect call with RegCall calling convention may use up all the
+      // general registers, so it is not suitable to bind EBX register for
+      // GOT address, just let register allocator handle it.
+      if (CallConv != CallingConv::X86_RegCall)
+        RegsToPass.push_back(std::make_pair(
+            Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+                                            getPointerTy(DAG.getDataLayout()))));
+    } else {
+      // If we are tail calling and generating PIC/GOT style code load the
+      // address of the callee into ECX. The value in ecx is used as target of
+      // the tail jump. This is done to circumvent the ebx/callee-saved problem
+      // for tail calls on PIC/GOT architectures. Normally we would just put the
+      // address of GOT into ebx and then call target@PLT. But for tail calls
+      // ebx would be restored (since ebx is callee saved) before jumping to the
+      // target@PLT.
+
+      // Note: The actual moving to ECX is done further down.
+      GlobalAddressSDNode *G = dyn_cast(Callee);
+      if (G && !G->getGlobal()->hasLocalLinkage() &&
+          G->getGlobal()->hasDefaultVisibility())
+        Callee = LowerGlobalAddress(Callee, DAG);
+      else if (isa(Callee))
+        Callee = LowerExternalSymbol(Callee, DAG);
+    }
+  }
+
+  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
+      (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
+    // From AMD64 ABI document:
+    // For calls that may call functions that use varargs or stdargs
+    // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an upper bound on the number of SSE
+    // registers used and is in the range 0 - 8 inclusive.
+
+    // Count the number of XMM registers allocated.
+    static const MCPhysReg XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
+    assert((Subtarget.hasSSE1() || !NumXMMRegs)
+           && "SSE registers cannot be used when SSE is disabled");
+    RegsToPass.push_back(std::make_pair(Register(X86::AL),
+                                        DAG.getConstant(NumXMMRegs, dl,
+                                                        MVT::i8)));
+  }
+
+  // For a vararg musttail call, pass on every forwarded register parameter
+  // recorded in X86MachineFunctionInfo.
+  if (isVarArg && IsMustTail) {
+    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
+    for (const auto &F : Forwards) {
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+      RegsToPass.push_back(std::make_pair(F.PReg, Val));
+    }
+  }
+
+  // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
+  // don't need this because the eligibility check rejects calls that require
+  // shuffling arguments passed in memory.
+  if (!IsSibcall && isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+    SmallVector MemOpChains2;
+    SDValue FIN;
+    int FI = 0;
+    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
+         ++I, ++OutsIndex) {
+      CCValAssign &VA = ArgLocs[I];
+
+      if (VA.isRegLoc()) {
+        if (VA.needsCustom()) {
+          assert((CallConv == CallingConv::X86_RegCall) &&
+                 "Expecting custom case only in regcall calling convention");
+          // This means that we are in special case where one argument was
+          // passed through two register locations - Skip the next location
+          ++I;
+        }
+
+        continue;
+      }
+
+      assert(VA.isMemLoc());
+      SDValue Arg = OutVals[OutsIndex];
+      ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
+      // Skip inalloca/preallocated arguments. They don't require any work.
+      if (Flags.isInAlloca() || Flags.isPreallocated())
+        continue;
+      // Create frame index: the slot sits FPDiff bytes away from the location
+      // assigned by the calling convention; size is LocVT rounded up to bytes.
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+      FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
+      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+
+      if (Flags.isByVal()) {
+        // Copy relative to framepointer.
+        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
+        if (!StackPtr.getNode())
+          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                                        getPointerTy(DAG.getDataLayout()));
+        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                             StackPtr, Source);
+
+        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+                                                         ArgChain,
+                                                         Flags, DAG, dl));
+      } else {
+        // Store relative to framepointer.
+        MemOpChains2.push_back(DAG.getStore(
+            ArgChain, dl, Arg, FIN,
+            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+      }
+    }
+
+    if (!MemOpChains2.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
+
+    // Store the return address to the appropriate stack slot.
+    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+                                     getPointerTy(DAG.getDataLayout()),
+                                     RegInfo->getSlotSize(), FPDiff, dl);
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and glue operands which copy the outgoing args into registers.
+  SDValue InGlue;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InGlue);
+    InGlue = Chain.getValue(1);
+  }
+
+  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
+    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+    // In the 64-bit large code model, we have to make all calls
+    // through a register, since the call instruction's 32-bit
+    // pc-relative offset may not be large enough to hold the whole
+    // address.
+  } else if (Callee->getOpcode() == ISD::GlobalAddress ||
+             Callee->getOpcode() == ISD::ExternalSymbol) {
+    // Lower direct calls to global addresses and external symbols. Setting
+    // ForCall to true here has the effect of removing WrapperRIP when possible
+    // to allow direct calls to be selected without first materializing the
+    // address into a register.
+    Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
+  } else if (Subtarget.isTarget64BitILP32() &&
+             Callee.getValueType() == MVT::i32) {
+    // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
+    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
+  }
+
+  // The call node returns a chain and a glue for the retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector Ops;
+
+  if (!IsSibcall && isTailCall && !IsMustTail) {
+    Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
+    InGlue = Chain.getValue(1);
+  }
+
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  if (isTailCall)
+    Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const uint32_t *Mask = [&]() {
+    auto AdaptedCC = CallConv;
+    // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
+    // use X86_INTR calling convention because it has the same CSR mask
+    // (same preserved registers).
+    if (HasNCSR)
+      AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
+    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
+    // to use the CSR_NoRegs_RegMask.
+    if (CB && CB->hasFnAttr("no_callee_saved_registers"))
+      AdaptedCC = (CallingConv::ID)CallingConv::GHC;
+    return RegInfo->getCallPreservedMask(MF, AdaptedCC);
+  }();
+  assert(Mask && "Missing call preserved mask for calling convention");
+
+  // If this is an invoke in a 32-bit function using a funclet-based
+  // personality, assume the function clobbers all registers. If an exception
+  // is thrown, the runtime will not restore CSRs.
+  // FIXME: Model this more precisely so that we can register allocate across
+  // the normal edge and spill and fill across the exceptional edge.
+  if (!Is64Bit && CLI.CB && isa(CLI.CB)) {
+    const Function &CallerFn = MF.getFunction();
+    EHPersonality Pers =
+        CallerFn.hasPersonalityFn()
+            ? classifyEHPersonality(CallerFn.getPersonalityFn())
+            : EHPersonality::Unknown;
+    if (isFuncletEHPersonality(Pers))
+      Mask = RegInfo->getNoPreservedMask();
+  }
+
+  // Define a new register mask from the existing mask.
+  uint32_t *RegMask = nullptr;
+
+  // In some calling conventions we need to remove the used physical registers
+  // from the reg mask. Create a new RegMask for such calling conventions.
+  // RegMask for calling conventions that disable only return registers (e.g.
+  // preserve_most) will be modified later in LowerCallResult.
+  bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
+  if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
+    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+    // Allocate a new Reg Mask and copy Mask.
+    RegMask = MF.allocateRegMask();
+    unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
+    memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
+
+    // Make sure all sub registers of the argument registers are reset
+    // in the RegMask.
+    if (ShouldDisableArgRegs) {
+      for (auto const &RegPair : RegsToPass)
+        for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
+          RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
+    }
+
+    // Create the RegMask Operand according to our updated mask.
+    Ops.push_back(DAG.getRegisterMask(RegMask));
+  } else {
+    // Create the RegMask Operand according to the static mask.
+    Ops.push_back(DAG.getRegisterMask(Mask));
+  }
+
+  if (InGlue.getNode())
+    Ops.push_back(InGlue);
+
+  if (isTailCall) {
+    // We used to do:
+    //// If this is the first return lowered for this function, add the regs
+    //// to the liveout set for the function.
+    // This isn't right, although it's probably harmless on x86; liveouts
+    // should be computed from returns not tail calls. Consider a void
+    // function making a tail call to a function returning int.
+    MF.getFrameInfo().setHasTailCall();
+    SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+
+    if (IsCFICall)
+      Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
+
+    DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
+    DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+    return Ret;
+  }
+
+  if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
+    Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
+  } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
+    // Calls with a "clang.arc.attachedcall" bundle are special. They should be
+    // expanded to the call, directly followed by a special marker sequence and
+    // a call to an ObjC library function. Use the CALL_RVMARKER to do that.
+    assert(!isTailCall &&
+           "tail calls cannot be marked with clang.arc.attachedcall");
+    assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
+
+    // Add a target global address for the retainRV/claimRV runtime function
+    // just before the call target.
+    Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
+    auto PtrVT = getPointerTy(DAG.getDataLayout());
+    auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
+    Ops.insert(Ops.begin() + 1, GA);
+    Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
+  } else {
+    Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
+  }
+
+  if (IsCFICall)
+    Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
+
+  InGlue = Chain.getValue(1);
+  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
+  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+
+  // Save heapallocsite metadata.
+  if (CLI.CB)
+    if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
+      DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
+
+  // Create the CALLSEQ_END node.
+  unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       DAG.getTarget().Options.GuaranteedTailCallOpt))
+    NumBytesForCalleeToPop = NumBytes; // Callee pops everything
+  else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
+    // If this call passes a struct-return pointer, the callee
+    // pops that struct pointer.
+    NumBytesForCalleeToPop = 4;
+
+  // CALLSEQ_END produces the glue for the retval copy to use.
+  if (!IsSibcall) {
+    Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
+                               InGlue, dl);
+    InGlue = Chain.getValue(1);
+  }
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
+                         InVals, RegMask);
+}
+
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like the std call convention, the callee cleans arguments, except that ECX is
+// reserved for storing the tail called function address. Only 2 registers are
+// free for argument passing (inreg). Tail call optimization is performed
+// provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// On X86_64 architecture with GOT-style position independent code only local
+// (within module) calls are supported at the moment.
+// To keep the stack aligned according to platform abi the function
+// GetAlignedArgumentStackSize ensures that argument delta is always a multiple
+// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
+// If a tail called function callee has more arguments than the caller the
+// caller needs to make sure that there is room to move the RETADDR to. This is
+// achieved by reserving an area the size of the argument delta right after the
+// original RETADDR, but before the saved framepointer or the spilled registers
+// e.g.
caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) +// stack layout: +// arg1 +// arg2 +// RETADDR +// [ new RETADDR +// move area ] +// (possible EBP) +// ESI +// EDI +// local1 .. + +/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align +/// requirement. +unsigned +X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, + SelectionDAG &DAG) const { + const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign(); + const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); + assert(StackSize % SlotSize == 0 && + "StackSize must be a multiple of SlotSize"); + return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize; +} + +/// Return true if the given stack call argument is already available in the +/// same position (relatively) of the caller's incoming argument stack. +static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, + MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, + const X86InstrInfo *TII, const CCValAssign &VA) { + unsigned Bytes = Arg.getValueSizeInBits() / 8; + + for (;;) { + // Look through nodes that don't alter the bits of the incoming value. 
+ unsigned Op = Arg.getOpcode(); + if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { + Arg = Arg.getOperand(0); + continue; + } + if (Op == ISD::TRUNCATE) { + const SDValue &TruncInput = Arg.getOperand(0); + if (TruncInput.getOpcode() == ISD::AssertZext && + cast(TruncInput.getOperand(1))->getVT() == + Arg.getValueType()) { + Arg = TruncInput.getOperand(0); + continue; + } + } + break; + } + + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + Register VR = cast(Arg.getOperand(1))->getReg(); + if (!VR.isVirtual()) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(*Def, FI)) + return false; + } else { + unsigned Opcode = Def->getOpcode(); + if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || + Opcode == X86::LEA64_32r) && + Def->getOperand(1).isFI()) { + FI = Def->getOperand(1).getIndex(); + Bytes = Flags.getByValSize(); + } else + return false; + } + } else if (LoadSDNode *Ld = dyn_cast(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { + FrameIndexSDNode *FINode = cast(Arg); + FI = FINode->getIndex(); + Bytes = Flags.getByValSize(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI.isFixedObjectIndex(FI)) + return false; + + if (Offset != MFI.getObjectOffset(FI)) + return false; + + // If this is not byval, check that the argument stack object is immutable. + // inalloca and argument copy elision can create mutable argument stack + // objects. Byval objects can be mutated, but a byval call intends to pass the + // mutated memory. 
+ if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) + return false; + + if (VA.getLocVT().getFixedSizeInBits() > + Arg.getValueSizeInBits().getFixedValue()) { + // If the argument location is wider than the argument type, check that any + // extension flags match. + if (Flags.isZExt() != MFI.isObjectZExt(FI) || + Flags.isSExt() != MFI.isObjectSExt(FI)) { + return false; + } + } + + return Bytes == MFI.getObjectSize(FI); +} + +/// Check whether the call is eligible for tail call optimization. Targets +/// that want to do tail call optimization should implement this function. +bool X86TargetLowering::IsEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet, + bool isVarArg, Type *RetTy, const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, SelectionDAG &DAG) const { + if (!mayTailCallThisCC(CalleeCC)) + return false; + + // If -tailcallopt is specified, make fastcc functions tail-callable. + MachineFunction &MF = DAG.getMachineFunction(); + const Function &CallerF = MF.getFunction(); + + // If the function return type is x86_fp80 and the callee return type is not, + // then the FP_EXTEND of the call result is not a nop. It's not safe to + // perform a tailcall optimization here. + if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) + return false; + + CallingConv::ID CallerCC = CallerF.getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); + bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); + bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || + CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail; + + // Win64 functions have extra shadow space for argument homing. Don't do the + // sibcall if the caller and callee have mismatched expectations for this + // space. 
+ if (IsCalleeWin64 != IsCallerWin64) + return false; + + if (IsGuaranteeTCO) { + if (canGuaranteeTCO(CalleeCC) && CCMatch) + return true; + return false; + } + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. + + // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to + // emit a special epilogue. + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + if (RegInfo->hasStackRealignment(MF)) + return false; + + // Also avoid sibcall optimization if we're an sret return fn and the callee + // is incompatible. See comment in LowerReturn about why hasStructRetAttr is + // insufficient. + if (MF.getInfo()->getSRetReturnReg()) { + // For a compatible tail call the callee must return our sret pointer. So it + // needs to be (a) an sret function itself and (b) we pass our sret as its + // sret. Condition #b is harder to determine. + return false; + } else if (IsCalleePopSRet) + // The callee pops an sret, so we cannot tail-call, as our caller doesn't + // expect that. + return false; + + // Do not sibcall optimize vararg calls unless all arguments are passed via + // registers. + LLVMContext &C = *DAG.getContext(); + if (isVarArg && !Outs.empty()) { + // Optimizing for varargs on Win64 is unlikely to be safe without + // additional testing. + if (IsCalleeWin64 || IsCallerWin64) + return false; + + SmallVector ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + + // If the call result is in ST0 / ST1, it needs to be popped off the x87 + // stack. Therefore, if it's not used by the call it is not safe to optimize + // this into a sibcall. 
+ bool Unused = false; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + if (!Ins[i].Used) { + Unused = true; + break; + } + } + if (Unused) { + SmallVector RVLocs; + CCState CCInfo(CalleeCC, false, MF, RVLocs, C); + CCInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) + return false; + } + } + + // Check that the call results are passed in the same way. + if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, + RetCC_X86, RetCC_X86)) + return false; + // The callee has to preserve all registers the caller needs to preserve. + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); + if (!CCMatch) { + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) + return false; + } + + unsigned StackArgsSize = 0; + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); + + // Allocate shadow area for Win64 + if (IsCalleeWin64) + CCInfo.AllocateStack(32, Align(8)); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + StackArgsSize = CCInfo.getStackSize(); + + if (CCInfo.getStackSize()) { + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. 
+ MachineFrameInfo &MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII, VA)) + return false; + } + } + } + + bool PositionIndependent = isPositionIndependent(); + // If the tailcall address may be in a register, then make sure it's + // possible to register allocate for it. In 32-bit, the call address can + // only target EAX, EDX, or ECX since the tail call must be scheduled after + // callee-saved registers are restored. These happen to be the same + // registers used to pass 'inreg' arguments so watch out for those. + if (!Subtarget.is64Bit() && ((!isa(Callee) && + !isa(Callee)) || + PositionIndependent)) { + unsigned NumInRegs = 0; + // In PIC we need an extra register to formulate the address computation + // for the callee. + unsigned MaxInRegs = PositionIndependent ? 2 : 3; + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (!VA.isRegLoc()) + continue; + Register Reg = VA.getLocReg(); + switch (Reg) { + default: break; + case X86::EAX: case X86::EDX: case X86::ECX: + if (++NumInRegs == MaxInRegs) + return false; + break; + } + } + } + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) + return false; + } + + bool CalleeWillPop = + X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt); + + if (unsigned BytesToPop = + MF.getInfo()->getBytesToPopOnReturn()) { + // If we have bytes to pop, the callee must pop them. 
+ bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; + if (!CalleePopMatches) + return false; + } else if (CalleeWillPop && StackArgsSize > 0) { + // If we don't have bytes to pop, make sure the callee doesn't pop any. + return false; + } + + return true; +} + +/// Determines whether the callee is required to pop its own arguments. +/// Callee pop is necessary to support tail calls. +bool X86::isCalleePop(CallingConv::ID CallingConv, + bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { + // If GuaranteeTCO is true, we force some calls to be callee pop so that we + // can guarantee TCO. + if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) + return true; + + switch (CallingConv) { + default: + return false; + case CallingConv::X86_StdCall: + case CallingConv::X86_FastCall: + case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: + return !is64Bit; + } +} + diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1813,6 +1813,9 @@ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const override; + + SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, + SDValue V2) const; }; namespace X86 { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43,7 +43,6 @@ #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" @@ -70,8 +69,6 @@ #define DEBUG_TYPE "x86-isel" -STATISTIC(NumTailCalls, "Number of tail calls"); - static cl::opt ExperimentalPrefInnermostLoopAlignment( "x86-experimental-pref-innermost-loop-alignment", 
cl::init(4), cl::desc( @@ -93,38 +90,6 @@ "stores respectively."), cl::Hidden); -/// Call this when the user attempts to do something unsupported, like -/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike -/// report_fatal_error, so calling code should attempt to recover without -/// crashing. -static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, - const char *Msg) { - MachineFunction &MF = DAG.getMachineFunction(); - DAG.getContext()->diagnose( - DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc())); -} - -/// Returns true if a CC can dynamically exclude a register from the list of -/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on -/// the return registers. -static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) { - switch (CC) { - default: - return false; - case CallingConv::X86_RegCall: - case CallingConv::PreserveMost: - case CallingConv::PreserveAll: - return true; - } -} - -/// Returns true if a CC can dynamically exclude a register from the list of -/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on -/// the parameters. -static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) { - return CC == CallingConv::X86_RegCall; -} - X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -2548,3056 +2513,169 @@ return TargetLoweringBase::getPreferredVectorAction(VT); } -static std::pair -handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC, - const X86Subtarget &Subtarget) { - // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling - // convention is one that uses k registers. 
- if (NumElts == 2) - return {MVT::v2i64, 1}; - if (NumElts == 4) - return {MVT::v4i32, 1}; - if (NumElts == 8 && CC != CallingConv::X86_RegCall && - CC != CallingConv::Intel_OCL_BI) - return {MVT::v8i16, 1}; - if (NumElts == 16 && CC != CallingConv::X86_RegCall && - CC != CallingConv::Intel_OCL_BI) - return {MVT::v16i8, 1}; - // v32i1 passes in ymm unless we have BWI and the calling convention is - // regcall. - if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall)) - return {MVT::v32i8, 1}; - // Split v64i1 vectors if we don't have v64i8 available. - if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) { - if (Subtarget.useAVX512Regs()) - return {MVT::v64i8, 1}; - return {MVT::v32i8, 2}; - } - - // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. - if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) || - NumElts > 64) - return {MVT::i8, NumElts}; - - return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0}; -} - -MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, - CallingConv::ID CC, - EVT VT) const { - if (VT.isVector()) { - if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { - unsigned NumElts = VT.getVectorNumElements(); - - MVT RegisterVT; - unsigned NumRegisters; - std::tie(RegisterVT, NumRegisters) = - handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); - if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) - return RegisterVT; - } - - if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) - return MVT::v8f16; - } - - // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled. 
- if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() && - !Subtarget.hasX87()) - return MVT::i32; - - if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) - return getRegisterTypeForCallingConv(Context, CC, - VT.changeVectorElementType(MVT::f16)); - - return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); -} - -unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, - CallingConv::ID CC, - EVT VT) const { - if (VT.isVector()) { - if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { - unsigned NumElts = VT.getVectorNumElements(); - - MVT RegisterVT; - unsigned NumRegisters; - std::tie(RegisterVT, NumRegisters) = - handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); - if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) - return NumRegisters; - } - - if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) - return 1; - } - - // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if - // x87 is disabled. - if (!Subtarget.is64Bit() && !Subtarget.hasX87()) { - if (VT == MVT::f64) - return 2; - if (VT == MVT::f80) - return 3; - } - - if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) - return getNumRegistersForCallingConv(Context, CC, - VT.changeVectorElementType(MVT::f16)); - - return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); -} - -unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( - LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, - unsigned &NumIntermediates, MVT &RegisterVT) const { - // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. 
- if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512() && - (!isPowerOf2_32(VT.getVectorNumElements()) || - (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) || - VT.getVectorNumElements() > 64)) { - RegisterVT = MVT::i8; - IntermediateVT = MVT::i1; - NumIntermediates = VT.getVectorNumElements(); - return NumIntermediates; - } - - // Split v64i1 vectors if we don't have v64i8 available. - if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && - CC != CallingConv::X86_RegCall) { - RegisterVT = MVT::v32i8; - IntermediateVT = MVT::v32i1; - NumIntermediates = 2; - return 2; - } - - // Split vNbf16 vectors according to vNf16. - if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) - VT = VT.changeVectorElementType(MVT::f16); - - return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, - NumIntermediates, RegisterVT); +FastISel * +X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + return X86::createFastISel(funcInfo, libInfo); } -EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, - LLVMContext& Context, - EVT VT) const { - if (!VT.isVector()) - return MVT::i8; - - if (Subtarget.hasAVX512()) { - // Figure out what this type will be legalized to. - EVT LegalVT = VT; - while (getTypeAction(Context, LegalVT) != TypeLegal) - LegalVT = getTypeToTransformTo(Context, LegalVT); - - // If we got a 512-bit vector then we'll definitely have a vXi1 compare. 
- if (LegalVT.getSimpleVT().is512BitVector()) - return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount()); +//===----------------------------------------------------------------------===// +// Other Lowering Hooks +//===----------------------------------------------------------------------===// - if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) { - // If we legalized to less than a 512-bit vector, then we will use a vXi1 - // compare for vXi32/vXi64 for sure. If we have BWI we will also support - // vXi16/vXi8. - MVT EltVT = LegalVT.getSimpleVT().getVectorElementType(); - if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32) - return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount()); - } - } +bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, + bool AssumeSingleUse) { + if (!AssumeSingleUse && !Op.hasOneUse()) + return false; + if (!ISD::isNormalLoad(Op.getNode())) + return false; - return VT.changeVectorElementTypeToInteger(); -} + // If this is an unaligned vector, make sure the target supports folding it. + auto *Ld = cast(Op.getNode()); + if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() && + Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16)) + return false; -/// Helper for getByValTypeAlignment to determine -/// the desired ByVal argument alignment. 
-static void getMaxByValAlign(Type *Ty, Align &MaxAlign) { - if (MaxAlign == 16) - return; - if (VectorType *VTy = dyn_cast(Ty)) { - if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128) - MaxAlign = Align(16); - } else if (ArrayType *ATy = dyn_cast(Ty)) { - Align EltAlign; - getMaxByValAlign(ATy->getElementType(), EltAlign); - if (EltAlign > MaxAlign) - MaxAlign = EltAlign; - } else if (StructType *STy = dyn_cast(Ty)) { - for (auto *EltTy : STy->elements()) { - Align EltAlign; - getMaxByValAlign(EltTy, EltAlign); - if (EltAlign > MaxAlign) - MaxAlign = EltAlign; - if (MaxAlign == 16) - break; - } - } -} + // TODO: If this is a non-temporal load and the target has an instruction + // for it, it should not be folded. See "useNonTemporalLoad()". -/// Return the desired alignment for ByVal aggregate -/// function arguments in the caller parameter area. For X86, aggregates -/// that contain SSE vectors are placed at 16-byte boundaries while the rest -/// are at 4-byte boundaries. -uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty, - const DataLayout &DL) const { - if (Subtarget.is64Bit()) { - // Max of 8 and alignment of type. - Align TyAlign = DL.getABITypeAlign(Ty); - if (TyAlign > 8) - return TyAlign.value(); - return 8; - } - - Align Alignment(4); - if (Subtarget.hasSSE1()) - getMaxByValAlign(Ty, Alignment); - return Alignment.value(); -} - -/// It returns EVT::Other if the type should be determined using generic -/// target-independent logic. -/// For vector ops we check that the overall size isn't larger than our -/// preferred vector width. -EVT X86TargetLowering::getOptimalMemOpType( - const MemOp &Op, const AttributeList &FuncAttributes) const { - if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) { - if (Op.size() >= 16 && - (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) { - // FIXME: Check if unaligned 64-byte accesses are slow. 
- if (Op.size() >= 64 && Subtarget.hasAVX512() && - (Subtarget.getPreferVectorWidth() >= 512)) { - return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; - } - // FIXME: Check if unaligned 32-byte accesses are slow. - if (Op.size() >= 32 && Subtarget.hasAVX() && - Subtarget.useLight256BitInstructions()) { - // Although this isn't a well-supported type for AVX1, we'll let - // legalization and shuffle lowering produce the optimal codegen. If we - // choose an optimal type with a vector element larger than a byte, - // getMemsetStores() may create an intermediate splat (using an integer - // multiply) before we splat as a vector. - return MVT::v32i8; - } - if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128)) - return MVT::v16i8; - // TODO: Can SSE1 handle a byte vector? - // If we have SSE1 registers we should be able to use them. - if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) && - (Subtarget.getPreferVectorWidth() >= 128)) - return MVT::v4f32; - } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) && - Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { - // Do not use f64 to lower memcpy if source is string constant. It's - // better to use i32 to avoid the loads. - // Also, do not use f64 to lower memset unless this is a memset of zeros. - // The gymnastics of splatting a byte value into an XMM register and then - // only using 8-byte stores (because this is a CPU with slow unaligned - // 16-byte accesses) makes that a loser. - return MVT::f64; - } - } - // This is a compromise. If we reach here, unaligned accesses may be slow on - // this target. However, creating smaller, aligned accesses could be even - // slower and would certainly be a lot more code. 
- if (Subtarget.is64Bit() && Op.size() >= 8) - return MVT::i64; - return MVT::i32; -} - -bool X86TargetLowering::isSafeMemOpType(MVT VT) const { - if (VT == MVT::f32) - return Subtarget.hasSSE1(); - if (VT == MVT::f64) - return Subtarget.hasSSE2(); return true; } -static bool isBitAligned(Align Alignment, uint64_t SizeInBits) { - return (8 * Alignment.value()) % SizeInBits == 0; -} - -bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const { - if (isBitAligned(Alignment, VT.getSizeInBits())) - return true; - switch (VT.getSizeInBits()) { - default: - // 8-byte and under are always assumed to be fast. - return true; - case 128: - return !Subtarget.isUnalignedMem16Slow(); - case 256: - return !Subtarget.isUnalignedMem32Slow(); - // TODO: What about AVX-512 (512-bit) accesses? - } -} - -bool X86TargetLowering::allowsMisalignedMemoryAccesses( - EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags, - unsigned *Fast) const { - if (Fast) - *Fast = isMemoryAccessFast(VT, Alignment); - // NonTemporal vector memory ops must be aligned. - if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { - // NT loads can only be vector aligned, so if its less aligned than the - // minimum vector size (which we can split the vector down to), we might as - // well use a regular unaligned vector load. - // We don't have any NT loads pre-SSE41. - if (!!(Flags & MachineMemOperand::MOLoad)) - return (Alignment < 16 || !Subtarget.hasSSE41()); +bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, + const X86Subtarget &Subtarget, + bool AssumeSingleUse) { + assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory"); + if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse)) return false; - } - // Misaligned accesses of any size are always allowed. 
- return true; -} - -bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context, - const DataLayout &DL, EVT VT, - unsigned AddrSpace, Align Alignment, - MachineMemOperand::Flags Flags, - unsigned *Fast) const { - if (Fast) - *Fast = isMemoryAccessFast(VT, Alignment); - if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { - if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, - /*Fast=*/nullptr)) - return true; - // NonTemporal vector memory ops are special, and must be aligned. - if (!isBitAligned(Alignment, VT.getSizeInBits())) - return false; - switch (VT.getSizeInBits()) { - case 128: - if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41()) - return true; - if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2()) - return true; - return false; - case 256: - if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2()) - return true; - if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX()) - return true; - return false; - case 512: - if (Subtarget.hasAVX512()) - return true; - return false; - default: - return false; // Don't have NonTemporal vector memory ops of this size. - } - } - return true; -} - -/// Return the entry encoding for a jump table in the -/// current function. The returned value is a member of the -/// MachineJumpTableInfo::JTEntryKind enum. -unsigned X86TargetLowering::getJumpTableEncoding() const { - // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF - // symbol. - if (isPositionIndependent() && Subtarget.isPICStyleGOT()) - return MachineJumpTableInfo::EK_Custom32; - - // Otherwise, use the normal jump table encoding heuristics. 
- return TargetLowering::getJumpTableEncoding(); -} - -bool X86TargetLowering::splitValueIntoRegisterParts( - SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, - unsigned NumParts, MVT PartVT, std::optional CC) const { - bool IsABIRegCopy = CC.has_value(); - EVT ValueVT = Val.getValueType(); - if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { - unsigned ValueBits = ValueVT.getSizeInBits(); - unsigned PartBits = PartVT.getSizeInBits(); - Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); - Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); - Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); - Parts[0] = Val; - return true; - } - return false; -} -SDValue X86TargetLowering::joinRegisterPartsIntoValue( - SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, - MVT PartVT, EVT ValueVT, std::optional CC) const { - bool IsABIRegCopy = CC.has_value(); - if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { - unsigned ValueBits = ValueVT.getSizeInBits(); - unsigned PartBits = PartVT.getSizeInBits(); - SDValue Val = Parts[0]; - - Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); - Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); - Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); - return Val; - } - return SDValue(); + // We can not replace a wide volatile load with a broadcast-from-memory, + // because that would narrow the load, which isn't legal for volatiles. 
+ auto *Ld = cast(Op.getNode()); + return !Ld->isVolatile() || + Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits(); } -bool X86TargetLowering::useSoftFloat() const { - return Subtarget.useSoftFloat(); +bool X86::mayFoldIntoStore(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } -void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, - ArgListTy &Args) const { - - // Only relabel X86-32 for C / Stdcall CCs. - if (Subtarget.is64Bit()) - return; - if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) - return; - unsigned ParamRegs = 0; - if (auto *M = MF->getFunction().getParent()) - ParamRegs = M->getNumberRegisterParameters(); - - // Mark the first N int arguments as having reg - for (auto &Arg : Args) { - Type *T = Arg.Ty; - if (T->isIntOrPtrTy()) - if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { - unsigned numRegs = 1; - if (MF->getDataLayout().getTypeAllocSize(T) > 4) - numRegs = 2; - if (ParamRegs < numRegs) - return; - ParamRegs -= numRegs; - Arg.IsInReg = true; - } +bool X86::mayFoldIntoZeroExtend(SDValue Op) { + if (Op.hasOneUse()) { + unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); + return (ISD::ZERO_EXTEND == Opcode); } + return false; } -const MCExpr * -X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, - const MachineBasicBlock *MBB, - unsigned uid,MCContext &Ctx) const{ - assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); - // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF - // entries. - return MCSymbolRefExpr::create(MBB->getSymbol(), - MCSymbolRefExpr::VK_GOTOFF, Ctx); -} - -/// Returns relocation base for the given PIC jumptable. -SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, - SelectionDAG &DAG) const { - if (!Subtarget.is64Bit()) - // This doesn't have SDLoc associated with it, but is not really the - // same as a Register. 
- return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), - getPointerTy(DAG.getDataLayout())); - return Table; -} - -/// This returns the relocation base for the given PIC jumptable, -/// the same as getPICJumpTableRelocBase, but as an MCExpr. -const MCExpr *X86TargetLowering:: -getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, - MCContext &Ctx) const { - // X86-64 uses RIP relative addressing based on the jump table label. - if (Subtarget.isPICStyleRIPRel()) - return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); - - // Otherwise, the reference is relative to the PIC base. - return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); -} - -std::pair -X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, - MVT VT) const { - const TargetRegisterClass *RRC = nullptr; - uint8_t Cost = 1; - switch (VT.SimpleTy) { - default: - return TargetLowering::findRepresentativeClass(TRI, VT); - case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: - RRC = Subtarget.is64Bit() ? 
&X86::GR64RegClass : &X86::GR32RegClass; - break; - case MVT::x86mmx: - RRC = &X86::VR64RegClass; - break; - case MVT::f32: case MVT::f64: - case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: - case MVT::v4f32: case MVT::v2f64: - case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: - case MVT::v8f32: case MVT::v4f64: - case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: - case MVT::v16f32: case MVT::v8f64: - RRC = &X86::VR128XRegClass; - break; +static bool isTargetShuffle(unsigned Opcode) { + switch(Opcode) { + default: return false; + case X86ISD::BLENDI: + case X86ISD::PSHUFB: + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: + case X86ISD::INSERTPS: + case X86ISD::EXTRQI: + case X86ISD::INSERTQI: + case X86ISD::VALIGN: + case X86ISD::PALIGNR: + case X86ISD::VSHLDQ: + case X86ISD::VSRLDQ: + case X86ISD::MOVLHPS: + case X86ISD::MOVHLPS: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::MOVDDUP: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::MOVSH: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + case X86ISD::VBROADCAST: + case X86ISD::VPERMILPI: + case X86ISD::VPERMILPV: + case X86ISD::VPERM2X128: + case X86ISD::SHUF128: + case X86ISD::VPERMIL2: + case X86ISD::VPERMI: + case X86ISD::VPPERM: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: + case X86ISD::VZEXT_MOVL: + return true; } - return std::make_pair(RRC, Cost); -} - -unsigned X86TargetLowering::getAddressSpace() const { - if (Subtarget.is64Bit()) - return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 
256 : 257; - return 256; -} - -static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { - return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || - (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); -} - -static Constant* SegmentOffset(IRBuilderBase &IRB, - int Offset, unsigned AddressSpace) { - return ConstantExpr::getIntToPtr( - ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), - Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); } -Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { - // glibc, bionic, and Fuchsia have a special slot for the stack guard in - // tcbhead_t; use it instead of the usual global variable (see - // sysdeps/{i386,x86_64}/nptl/tls.h) - if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { - if (Subtarget.isTargetFuchsia()) { - // defines ZX_TLS_STACK_GUARD_OFFSET with this value. - return SegmentOffset(IRB, 0x10, getAddressSpace()); - } else { - unsigned AddressSpace = getAddressSpace(); - Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - // Specially, some users may customize the base reg and offset. - int Offset = M->getStackProtectorGuardOffset(); - // If we don't set -stack-protector-guard-offset value: - // %fs:0x28, unless we're using a Kernel code model, in which case - // it's %gs:0x28. gs:0x14 on i386. - if (Offset == INT_MAX) - Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; - - StringRef GuardReg = M->getStackProtectorGuardReg(); - if (GuardReg == "fs") - AddressSpace = X86AS::FS; - else if (GuardReg == "gs") - AddressSpace = X86AS::GS; - - // Use symbol guard if user specify. - StringRef GuardSymb = M->getStackProtectorGuardSymbol(); - if (!GuardSymb.empty()) { - GlobalVariable *GV = M->getGlobalVariable(GuardSymb); - if (!GV) { - Type *Ty = Subtarget.is64Bit() ? 
Type::getInt64Ty(M->getContext()) - : Type::getInt32Ty(M->getContext()); - GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, - nullptr, GuardSymb, nullptr, - GlobalValue::NotThreadLocal, AddressSpace); - GV->setDSOLocal(M->getDirectAccessExternalData()); - } - return GV; - } - - return SegmentOffset(IRB, Offset, AddressSpace); - } +static bool isTargetShuffleVariableMask(unsigned Opcode) { + switch (Opcode) { + default: return false; + // Target Shuffles. + case X86ISD::PSHUFB: + case X86ISD::VPERMILPV: + case X86ISD::VPERMIL2: + case X86ISD::VPPERM: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: + return true; + // 'Faux' Target Shuffles. + case ISD::OR: + case ISD::AND: + case X86ISD::ANDNP: + return true; } - return TargetLowering::getIRStackGuard(IRB); } -void X86TargetLowering::insertSSPDeclarations(Module &M) const { - // MSVC CRT provides functionalities for stack protection. - if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || - Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { - // MSVC CRT has a global variable holding security cookie. - M.getOrInsertGlobal("__security_cookie", - Type::getInt8PtrTy(M.getContext())); +SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo(); + int ReturnAddrIndex = FuncInfo->getRAIndex(); - // MSVC CRT has a function to validate security cookie. - FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( - "__security_check_cookie", Type::getVoidTy(M.getContext()), - Type::getInt8PtrTy(M.getContext())); - if (Function *F = dyn_cast(SecurityCheckCookie.getCallee())) { - F->setCallingConv(CallingConv::X86_FastCall); - F->addParamAttr(0, Attribute::AttrKind::InReg); - } - return; + if (ReturnAddrIndex == 0) { + // Set up a frame object for the return address. 
+ unsigned SlotSize = RegInfo->getSlotSize(); + ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, + -(int64_t)SlotSize, + false); + FuncInfo->setRAIndex(ReturnAddrIndex); } - StringRef GuardMode = M.getStackProtectorGuard(); - - // glibc, bionic, and Fuchsia have a special slot for the stack guard. - if ((GuardMode == "tls" || GuardMode.empty()) && - hasStackGuardSlotTLS(Subtarget.getTargetTriple())) - return; - TargetLowering::insertSSPDeclarations(M); -} - -Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { - // MSVC CRT has a global variable holding security cookie. - if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || - Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { - return M.getGlobalVariable("__security_cookie"); - } - return TargetLowering::getSDagStackGuard(M); + return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } -Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { - // MSVC CRT has a function to validate security cookie. - if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || - Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { - return M.getFunction("__security_check_cookie"); - } - return TargetLowering::getSSPStackGuardCheck(M); -} +bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, + bool hasSymbolicDisplacement) { + // Offset should fit into 32 bit immediate field. + if (!isInt<32>(Offset)) + return false; -Value * -X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const { - if (Subtarget.getTargetTriple().isOSContiki()) - return getDefaultSafeStackPointerLocation(IRB, false); + // If we don't have a symbolic displacement - we don't have any extra + // restrictions. + if (!hasSymbolicDisplacement) + return true; - // Android provides a fixed TLS slot for the SafeStack pointer. 
See the - // definition of TLS_SLOT_SAFESTACK in - // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h - if (Subtarget.isTargetAndroid()) { - // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: - // %gs:0x24 on i386 - int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; - return SegmentOffset(IRB, Offset, getAddressSpace()); - } + // FIXME: Some tweaks might be needed for medium code model. + if (M != CodeModel::Small && M != CodeModel::Kernel) + return false; - // Fuchsia is similar. - if (Subtarget.isTargetFuchsia()) { - // defines ZX_TLS_UNSAFE_SP_OFFSET with this value. - return SegmentOffset(IRB, 0x18, getAddressSpace()); - } + // For small code model we assume that latest object is 16MB before end of 31 + // bits boundary. We may also accept pretty large negative constants knowing + // that all objects are in the positive half of address space. + if (M == CodeModel::Small && Offset < 16*1024*1024) + return true; - return TargetLowering::getSafeStackPointerLocation(IRB); -} - -//===----------------------------------------------------------------------===// -// Return Value Calling Convention Implementation -//===----------------------------------------------------------------------===// - -bool X86TargetLowering::CanLowerReturn( - CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, - const SmallVectorImpl &Outs, LLVMContext &Context) const { - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); - return CCInfo.CheckReturn(Outs, RetCC_X86); -} - -const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { - static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; - return ScratchRegs; -} - -ArrayRef X86TargetLowering::getRoundingControlRegisters() const { - // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit - // tests at the moment, which is not what we expected. 
- static const MCPhysReg RCRegs[] = {X86::MXCSR}; - return RCRegs; -} - -/// Lowers masks values (v*i1) to the local register values -/// \returns DAG node after lowering to register type -static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, - const SDLoc &Dl, SelectionDAG &DAG) { - EVT ValVT = ValArg.getValueType(); - - if (ValVT == MVT::v1i1) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg, - DAG.getIntPtrConstant(0, Dl)); - - if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || - (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { - // Two stage lowering might be required - // bitcast: v8i1 -> i8 / v16i1 -> i16 - // anyextend: i8 -> i32 / i16 -> i32 - EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16; - SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg); - if (ValLoc == MVT::i32) - ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy); - return ValToCopy; - } - - if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) || - (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) { - // One stage lowering is required - // bitcast: v32i1 -> i32 / v64i1 -> i64 - return DAG.getBitcast(ValLoc, ValArg); - } - - return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg); -} - -/// Breaks v64i1 value into two registers and adds the new node to the DAG -static void Passv64i1ArgInRegs( - const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg, - SmallVectorImpl> &RegsToPass, CCValAssign &VA, - CCValAssign &NextVA, const X86Subtarget &Subtarget) { - assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); - assert(Subtarget.is32Bit() && "Expecting 32 bit target"); - assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); - assert(VA.isRegLoc() && NextVA.isRegLoc() && - "The value should reside in two registers"); - - // Before splitting the value we cast it to i64 - Arg = DAG.getBitcast(MVT::i64, Arg); - - // Splitting the value into two i32 types - SDValue Lo, Hi; - std::tie(Lo, Hi) = 
DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32); - - // Attach the two i32 types into corresponding registers - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); - RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); -} - -SDValue -X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SDLoc &dl, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - X86MachineFunctionInfo *FuncInfo = MF.getInfo(); - - // In some cases we need to disable registers from the default CSR list. - // For example, when they are used as return registers (preserve_* and X86's - // regcall) or for argument passing (X86's regcall). - bool ShouldDisableCalleeSavedRegister = - shouldDisableRetRegFromCSR(CallConv) || - MF.getFunction().hasFnAttribute("no_caller_saved_registers"); - - if (CallConv == CallingConv::X86_INTR && !Outs.empty()) - report_fatal_error("X86 interrupts may not return any value"); - - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); - CCInfo.AnalyzeReturn(Outs, RetCC_X86); - - SmallVector, 4> RetVals; - for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; - ++I, ++OutsIndex) { - CCValAssign &VA = RVLocs[I]; - assert(VA.isRegLoc() && "Can only return in registers!"); - - // Add the register to the CalleeSaveDisableRegs list. - if (ShouldDisableCalleeSavedRegister) - MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); - - SDValue ValToCopy = OutVals[OutsIndex]; - EVT ValVT = ValToCopy.getValueType(); - - // Promote values to the appropriate types. 
- if (VA.getLocInfo() == CCValAssign::SExt) - ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); - else if (VA.getLocInfo() == CCValAssign::ZExt) - ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); - else if (VA.getLocInfo() == CCValAssign::AExt) { - if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) - ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); - else - ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); - } - else if (VA.getLocInfo() == CCValAssign::BCvt) - ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); - - assert(VA.getLocInfo() != CCValAssign::FPExt && - "Unexpected FP-extend for return value."); - - // Report an error if we have attempted to return a value via an XMM - // register and SSE was disabled. - if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { - errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); - VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. - } else if (!Subtarget.hasSSE2() && - X86::FR64XRegClass.contains(VA.getLocReg()) && - ValVT == MVT::f64) { - // When returning a double via an XMM register, report an error if SSE2 is - // not enabled. - errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); - VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. - } - - // Returns in ST0/ST1 are handled specially: these are pushed as operands to - // the RET instruction and handled by the FP Stackifier. - if (VA.getLocReg() == X86::FP0 || - VA.getLocReg() == X86::FP1) { - // If this is a copy from an xmm register to ST(0), use an FPExtend to - // change the value to the FP stack register class. - if (isScalarFPTypeInSSEReg(VA.getValVT())) - ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); - RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); - // Don't emit a copytoreg. 
- continue; - } - - // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 - // which is returned in RAX / RDX. - if (Subtarget.is64Bit()) { - if (ValVT == MVT::x86mmx) { - if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { - ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); - ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, - ValToCopy); - // If we don't have SSE2 available, convert to v4f32 so the generated - // register is legal. - if (!Subtarget.hasSSE2()) - ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); - } - } - } - - if (VA.needsCustom()) { - assert(VA.getValVT() == MVT::v64i1 && - "Currently the only custom case is when we split v64i1 to 2 regs"); - - Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I], - Subtarget); - - // Add the second register to the CalleeSaveDisableRegs list. - if (ShouldDisableCalleeSavedRegister) - MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); - } else { - RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); - } - } - - SDValue Glue; - SmallVector RetOps; - RetOps.push_back(Chain); // Operand #0 = Chain (updated below) - // Operand #1 = Bytes To Pop - RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, - MVT::i32)); - - // Copy the result values into the output registers. - for (auto &RetVal : RetVals) { - if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) { - RetOps.push_back(RetVal.second); - continue; // Don't emit a copytoreg. - } - - Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue); - Glue = Chain.getValue(1); - RetOps.push_back( - DAG.getRegister(RetVal.first, RetVal.second.getValueType())); - } - - // Swift calling convention does not require we copy the sret argument - // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. 
- - // All x86 ABIs require that for returning structs by value we copy - // the sret argument into %rax/%eax (depending on ABI) for the return. - // We saved the argument into a virtual register in the entry block, - // so now we copy the value out and into %rax/%eax. - // - // Checking Function.hasStructRetAttr() here is insufficient because the IR - // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is - // false, then an sret argument may be implicitly inserted in the SelDAG. In - // either case FuncInfo->setSRetReturnReg() will have been called. - if (Register SRetReg = FuncInfo->getSRetReturnReg()) { - // When we have both sret and another return value, we should use the - // original Chain stored in RetOps[0], instead of the current Chain updated - // in the above loop. If we only have sret, RetOps[0] equals to Chain. - - // For the case of sret and another return value, we have - // Chain_0 at the function entry - // Chain_1 = getCopyToReg(Chain_0) in the above loop - // If we use Chain_1 in getCopyFromReg, we will have - // Val = getCopyFromReg(Chain_1) - // Chain_2 = getCopyToReg(Chain_1, Val) from below - - // getCopyToReg(Chain_0) will be glued together with - // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be - // in Unit B, and we will have cyclic dependency between Unit A and Unit B: - // Data dependency from Unit B to Unit A due to usage of Val in - // getCopyToReg(Chain_1, Val) - // Chain dependency from Unit A to Unit B - - // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. - SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, - getPointerTy(MF.getDataLayout())); - - Register RetValReg - = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? - X86::RAX : X86::EAX; - Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue); - Glue = Chain.getValue(1); - - // RAX/EAX now acts like a return value. 
- RetOps.push_back( - DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); - - // Add the returned register to the CalleeSaveDisableRegs list. Don't do - // this however for preserve_most/preserve_all to minimize the number of - // callee-saved registers for these CCs. - if (ShouldDisableCalleeSavedRegister && - CallConv != CallingConv::PreserveAll && - CallConv != CallingConv::PreserveMost) - MF.getRegInfo().disableCalleeSavedRegister(RetValReg); - } - - const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); - const MCPhysReg *I = - TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); - if (I) { - for (; *I; ++I) { - if (X86::GR64RegClass.contains(*I)) - RetOps.push_back(DAG.getRegister(*I, MVT::i64)); - else - llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - } - } - - RetOps[0] = Chain; // Update chain. - - // Add the glue if we have it. - if (Glue.getNode()) - RetOps.push_back(Glue); - - X86ISD::NodeType opcode = X86ISD::RET_GLUE; - if (CallConv == CallingConv::X86_INTR) - opcode = X86ISD::IRET; - return DAG.getNode(opcode, dl, MVT::Other, RetOps); -} - -bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { - if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) - return false; - - SDValue TCChain = Chain; - SDNode *Copy = *N->use_begin(); - if (Copy->getOpcode() == ISD::CopyToReg) { - // If the copy has a glue operand, we conservatively assume it isn't safe to - // perform a tail call. 
- if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) - return false; - TCChain = Copy->getOperand(0); - } else if (Copy->getOpcode() != ISD::FP_EXTEND) - return false; - - bool HasRet = false; - for (const SDNode *U : Copy->uses()) { - if (U->getOpcode() != X86ISD::RET_GLUE) - return false; - // If we are returning more than one value, we can definitely - // not make a tail call see PR19530 - if (U->getNumOperands() > 4) - return false; - if (U->getNumOperands() == 4 && - U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue) - return false; - HasRet = true; - } - - if (!HasRet) - return false; - - Chain = TCChain; - return true; -} - -EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, - ISD::NodeType ExtendKind) const { - MVT ReturnMVT = MVT::i32; - - bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); - if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { - // The ABI does not require i1, i8 or i16 to be extended. - // - // On Darwin, there is code in the wild relying on Clang's old behaviour of - // always extending i8/i16 return values, so keep doing that for now. - // (PR26665). - ReturnMVT = MVT::i8; - } - - EVT MinVT = getRegisterType(Context, ReturnMVT); - return VT.bitsLT(MinVT) ? MinVT : VT; -} - -/// Reads two 32 bit registers and creates a 64 bit mask value. -/// \param VA The current 32 bit value that need to be assigned. -/// \param NextVA The next 32 bit value that need to be assigned. -/// \param Root The parent DAG node. -/// \param [in,out] InGlue Represents SDvalue in the parent DAG node for -/// glue purposes. In the case the DAG is already using -/// physical register instead of virtual, we should glue -/// our new SDValue to InGlue SDvalue. -/// \return a new SDvalue of size 64bit. 
-static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, - SDValue &Root, SelectionDAG &DAG, - const SDLoc &Dl, const X86Subtarget &Subtarget, - SDValue *InGlue = nullptr) { - assert((Subtarget.hasBWI()) && "Expected AVX512BW target!"); - assert(Subtarget.is32Bit() && "Expecting 32 bit target"); - assert(VA.getValVT() == MVT::v64i1 && - "Expecting first location of 64 bit width type"); - assert(NextVA.getValVT() == VA.getValVT() && - "The locations should have the same type"); - assert(VA.isRegLoc() && NextVA.isRegLoc() && - "The values should reside in two registers"); - - SDValue Lo, Hi; - SDValue ArgValueLo, ArgValueHi; - - MachineFunction &MF = DAG.getMachineFunction(); - const TargetRegisterClass *RC = &X86::GR32RegClass; - - // Read a 32 bit value from the registers. - if (nullptr == InGlue) { - // When no physical register is present, - // create an intermediate virtual register. - Register Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); - Reg = MF.addLiveIn(NextVA.getLocReg(), RC); - ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); - } else { - // When a physical register is available read the value from it and glue - // the reads together. - ArgValueLo = - DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue); - *InGlue = ArgValueLo.getValue(2); - ArgValueHi = - DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue); - *InGlue = ArgValueHi.getValue(2); - } - - // Convert the i32 type into v32i1 type. - Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo); - - // Convert the i32 type into v32i1 type. - Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); - - // Concatenate the two values together. - return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); -} - -/// The function will lower a register of various sizes (8/16/32/64) -/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1) -/// \returns a DAG node contains the operand after lowering to mask type. 
-static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, - const EVT &ValLoc, const SDLoc &Dl, - SelectionDAG &DAG) { - SDValue ValReturned = ValArg; - - if (ValVT == MVT::v1i1) - return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); - - if (ValVT == MVT::v64i1) { - // In 32 bit machine, this case is handled by getv64i1Argument - assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); - // In 64 bit machine, There is no need to truncate the value only bitcast - } else { - MVT maskLen; - switch (ValVT.getSimpleVT().SimpleTy) { - case MVT::v8i1: - maskLen = MVT::i8; - break; - case MVT::v16i1: - maskLen = MVT::i16; - break; - case MVT::v32i1: - maskLen = MVT::i32; - break; - default: - llvm_unreachable("Expecting a vector of i1 types"); - } - - ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); - } - return DAG.getBitcast(ValVT, ValReturned); -} - -/// Lower the result values of a call into the -/// appropriate copies out of appropriate physical registers. -/// -SDValue X86TargetLowering::LowerCallResult( - SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, const SDLoc &dl, - SelectionDAG &DAG, SmallVectorImpl &InVals, - uint32_t *RegMask) const { - - const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); - // Assign locations to each value returned by this call. - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, - *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC_X86); - - // Copy all of the result registers out of their specified physreg. - for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; - ++I, ++InsIndex) { - CCValAssign &VA = RVLocs[I]; - EVT CopyVT = VA.getLocVT(); - - // In some calling conventions we need to remove the used registers - // from the register mask. 
- if (RegMask) { - for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg())) - RegMask[SubReg / 32] &= ~(1u << (SubReg % 32)); - } - - // Report an error if there was an attempt to return FP values via XMM - // registers. - if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { - errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); - if (VA.getLocReg() == X86::XMM1) - VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. - else - VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. - } else if (!Subtarget.hasSSE2() && - X86::FR64XRegClass.contains(VA.getLocReg()) && - CopyVT == MVT::f64) { - errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); - if (VA.getLocReg() == X86::XMM1) - VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. - else - VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. - } - - // If we prefer to use the value in xmm registers, copy it out as f80 and - // use a truncate to move it from fp stack reg to xmm reg. - bool RoundAfterCopy = false; - if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && - isScalarFPTypeInSSEReg(VA.getValVT())) { - if (!Subtarget.hasX87()) - report_fatal_error("X87 register return with X87 disabled"); - CopyVT = MVT::f80; - RoundAfterCopy = (CopyVT != VA.getLocVT()); - } - - SDValue Val; - if (VA.needsCustom()) { - assert(VA.getValVT() == MVT::v64i1 && - "Currently the only custom case is when we split v64i1 to 2 regs"); - Val = - getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue); - } else { - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue) - .getValue(1); - Val = Chain.getValue(0); - InGlue = Chain.getValue(2); - } - - if (RoundAfterCopy) - Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, - // This truncation won't change the value. 
- DAG.getIntPtrConstant(1, dl, /*isTarget=*/true)); - - if (VA.isExtInLoc()) { - if (VA.getValVT().isVector() && - VA.getValVT().getScalarType() == MVT::i1 && - ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || - (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { - // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 - Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); - } else - Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); - } - - if (VA.getLocInfo() == CCValAssign::BCvt) - Val = DAG.getBitcast(VA.getValVT(), Val); - - InVals.push_back(Val); - } - - return Chain; -} - -//===----------------------------------------------------------------------===// -// C & StdCall & Fast Calling Convention implementation -//===----------------------------------------------------------------------===// -// StdCall calling convention seems to be standard for many Windows' API -// routines and around. It differs from C calling convention just a little: -// callee should clean up the stack, not caller. Symbols should be also -// decorated in some fancy way :) It doesn't support any vector arguments. -// For info on fast calling convention see Fast Calling Convention (tail call) -// implementation LowerX86_32FastCCCallTo. - -/// Determines whether Args, either a set of outgoing arguments to a call, or a -/// set of incoming args of a call, contains an sret pointer that the callee -/// pops -template -static bool hasCalleePopSRet(const SmallVectorImpl &Args, - const X86Subtarget &Subtarget) { - // Not C++20 (yet), so no concepts available. - static_assert(std::is_same_v || - std::is_same_v, - "requires ISD::OutputArg or ISD::InputArg"); - - // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out - // for most compilations. - if (!Subtarget.is32Bit()) - return false; - - if (Args.empty()) - return false; - - // Most calls do not have an sret argument, check the arg next. 
- const ISD::ArgFlagsTy &Flags = Args[0].Flags; - if (!Flags.isSRet() || Flags.isInReg()) - return false; - - // The MSVCabi does not pop the sret. - if (Subtarget.getTargetTriple().isOSMSVCRT()) - return false; - - // MCUs don't pop the sret - if (Subtarget.isTargetMCU()) - return false; - - // Callee pops argument - return true; -} - -/// Make a copy of an aggregate at address specified by "Src" to address -/// "Dst" with size and alignment information specified by the specific -/// parameter attribute. The copy will be passed as a byval function parameter. -static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, - SDValue Chain, ISD::ArgFlagsTy Flags, - SelectionDAG &DAG, const SDLoc &dl) { - SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl); - - return DAG.getMemcpy( - Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), - /*isVolatile*/ false, /*AlwaysInline=*/true, - /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo()); -} - -/// Return true if the calling convention is one that we can guarantee TCO for. -static bool canGuaranteeTCO(CallingConv::ID CC) { - return (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || - CC == CallingConv::Tail || CC == CallingConv::SwiftTail); -} - -/// Return true if we might ever do TCO for calls with this calling convention. -static bool mayTailCallThisCC(CallingConv::ID CC) { - switch (CC) { - // C calling conventions: - case CallingConv::C: - case CallingConv::Win64: - case CallingConv::X86_64_SysV: - // Callee pop conventions: - case CallingConv::X86_ThisCall: - case CallingConv::X86_StdCall: - case CallingConv::X86_VectorCall: - case CallingConv::X86_FastCall: - // Swift: - case CallingConv::Swift: - return true; - default: - return canGuaranteeTCO(CC); - } -} - -/// Return true if the function is being made into a tailcall target by -/// changing its ABI. 
-static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { - return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || - CC == CallingConv::Tail || CC == CallingConv::SwiftTail; -} - -bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { - if (!CI->isTailCall()) - return false; - - CallingConv::ID CalleeCC = CI->getCallingConv(); - if (!mayTailCallThisCC(CalleeCC)) - return false; - - return true; -} - -SDValue -X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, - const SmallVectorImpl &Ins, - const SDLoc &dl, SelectionDAG &DAG, - const CCValAssign &VA, - MachineFrameInfo &MFI, unsigned i) const { - // Create the nodes corresponding to a load from this parameter slot. - ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = shouldGuaranteeTCO( - CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); - bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); - EVT ValVT; - MVT PtrVT = getPointerTy(DAG.getDataLayout()); - - // If value is passed by pointer we have address passed instead of the value - // itself. No need to extend if the mask value and location share the same - // absolute size. - bool ExtendedInMem = - VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && - VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); - - if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) - ValVT = VA.getLocVT(); - else - ValVT = VA.getValVT(); - - // FIXME: For now, all byval parameter objects are marked mutable. This can be - // changed with more analysis. - // In case of tail call optimization mark all arguments mutable. Since they - // could be overwritten by lowering of arguments in case of a tail call. - if (Flags.isByVal()) { - unsigned Bytes = Flags.getByValSize(); - if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. - - // FIXME: For now, all byval parameter objects are marked as aliasing. This - // can be improved with deeper analysis. 
- int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable, - /*isAliased=*/true); - return DAG.getFrameIndex(FI, PtrVT); - } - - EVT ArgVT = Ins[i].ArgVT; - - // If this is a vector that has been split into multiple parts, and the - // scalar size of the parts don't match the vector element size, then we can't - // elide the copy. The parts will have padding between them instead of being - // packed like a vector. - bool ScalarizedAndExtendedVector = - ArgVT.isVector() && !VA.getLocVT().isVector() && - VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits(); - - // This is an argument in memory. We might be able to perform copy elision. - // If the argument is passed directly in memory without any extension, then we - // can perform copy elision. Large vector types, for example, may be passed - // indirectly by pointer. - if (Flags.isCopyElisionCandidate() && - VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem && - !ScalarizedAndExtendedVector) { - SDValue PartAddr; - if (Ins[i].PartOffset == 0) { - // If this is a one-part value or the first part of a multi-part value, - // create a stack object for the entire argument value type and return a - // load from our portion of it. This assumes that if the first part of an - // argument is in memory, the rest will also be in memory. - int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), - /*IsImmutable=*/false); - PartAddr = DAG.getFrameIndex(FI, PtrVT); - return DAG.getLoad( - ValVT, dl, Chain, PartAddr, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); - } else { - // This is not the first piece of an argument in memory. See if there is - // already a fixed stack object including this offset. If so, assume it - // was created by the PartOffset == 0 branch above and create a load from - // the appropriate offset into it. 
- int64_t PartBegin = VA.getLocMemOffset(); - int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; - int FI = MFI.getObjectIndexBegin(); - for (; MFI.isFixedObjectIndex(FI); ++FI) { - int64_t ObjBegin = MFI.getObjectOffset(FI); - int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); - if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) - break; - } - if (MFI.isFixedObjectIndex(FI)) { - SDValue Addr = - DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), - DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); - return DAG.getLoad( - ValVT, dl, Chain, Addr, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, - Ins[i].PartOffset)); - } - } - } - - int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, - VA.getLocMemOffset(), isImmutable); - - // Set SExt or ZExt flag. - if (VA.getLocInfo() == CCValAssign::ZExt) { - MFI.setObjectZExt(FI, true); - } else if (VA.getLocInfo() == CCValAssign::SExt) { - MFI.setObjectSExt(FI, true); - } - - MaybeAlign Alignment; - if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && - ValVT != MVT::f80) - Alignment = MaybeAlign(4); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - SDValue Val = DAG.getLoad( - ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), - Alignment); - return ExtendedInMem - ? (VA.getValVT().isVector() - ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) - : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) - : Val; -} - -// FIXME: Get this from tablegen. 
-static ArrayRef get64BitArgumentGPRs(CallingConv::ID CallConv, - const X86Subtarget &Subtarget) { - assert(Subtarget.is64Bit()); - - if (Subtarget.isCallingConvWin64(CallConv)) { - static const MCPhysReg GPR64ArgRegsWin64[] = { - X86::RCX, X86::RDX, X86::R8, X86::R9 - }; - return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); - } - - static const MCPhysReg GPR64ArgRegs64Bit[] = { - X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 - }; - return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); -} - -// FIXME: Get this from tablegen. -static ArrayRef get64BitArgumentXMMs(MachineFunction &MF, - CallingConv::ID CallConv, - const X86Subtarget &Subtarget) { - assert(Subtarget.is64Bit()); - if (Subtarget.isCallingConvWin64(CallConv)) { - // The XMM registers which might contain var arg parameters are shadowed - // in their paired GPR. So we only need to save the GPR to their home - // slots. - // TODO: __vectorcall will change this. - return std::nullopt; - } - - bool isSoftFloat = Subtarget.useSoftFloat(); - if (isSoftFloat || !Subtarget.hasSSE1()) - // Kernel mode asks for SSE to be disabled, so there are no XMM argument - // registers. - return std::nullopt; - - static const MCPhysReg XMMArgRegs64Bit[] = { - X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, - X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 - }; - return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); -} - -#ifndef NDEBUG -static bool isSortedByValueNo(ArrayRef ArgLocs) { - return llvm::is_sorted( - ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool { - return A.getValNo() < B.getValNo(); - }); -} -#endif - -namespace { -/// This is a helper class for lowering variable arguments parameters. 
-class VarArgsLoweringHelper { -public: - VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc, - SelectionDAG &DAG, const X86Subtarget &Subtarget, - CallingConv::ID CallConv, CCState &CCInfo) - : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget), - TheMachineFunction(DAG.getMachineFunction()), - TheFunction(TheMachineFunction.getFunction()), - FrameInfo(TheMachineFunction.getFrameInfo()), - FrameLowering(*Subtarget.getFrameLowering()), - TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv), - CCInfo(CCInfo) {} - - // Lower variable arguments parameters. - void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize); - -private: - void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize); - - void forwardMustTailParameters(SDValue &Chain); - - bool is64Bit() const { return Subtarget.is64Bit(); } - bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); } - - X86MachineFunctionInfo *FuncInfo; - const SDLoc &DL; - SelectionDAG &DAG; - const X86Subtarget &Subtarget; - MachineFunction &TheMachineFunction; - const Function &TheFunction; - MachineFrameInfo &FrameInfo; - const TargetFrameLowering &FrameLowering; - const TargetLowering &TargLowering; - CallingConv::ID CallConv; - CCState &CCInfo; -}; -} // namespace - -void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters( - SDValue &Chain, unsigned StackSize) { - // If the function takes variable number of arguments, make a frame index for - // the start of the first vararg value... for expansion of llvm.va_start. We - // can skip this if there are no va_start calls. - if (is64Bit() || (CallConv != CallingConv::X86_FastCall && - CallConv != CallingConv::X86_ThisCall)) { - FuncInfo->setVarArgsFrameIndex( - FrameInfo.CreateFixedObject(1, StackSize, true)); - } - - // 64-bit calling conventions support varargs and register parameters, so we - // have to do extra work to spill them in the prologue. 
- if (is64Bit()) { - // Find the first unallocated argument registers. - ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); - ArrayRef ArgXMMs = - get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget); - unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); - unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); - - assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && - "SSE register cannot be used when SSE is disabled!"); - - if (isWin64()) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex( - FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - // Fixup to set vararg frame on shadow area (4 x i64). - if (NumIntRegs < 4) - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); - } else { - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so - // they may be loaded by dereferencing the result of va_next. - FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject( - ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false)); - } - - SmallVector - LiveGPRs; // list of SDValue for GPR registers keeping live input value - SmallVector LiveXMMRegs; // list of SDValue for XMM registers - // keeping live input value - SDValue ALVal; // if applicable keeps SDValue for %al register - - // Gather all the live in physical registers. 
- for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { - Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass); - LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64)); - } - const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs); - if (!AvailableXmms.empty()) { - Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); - ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8); - for (MCPhysReg Reg : AvailableXmms) { - // FastRegisterAllocator spills virtual registers at basic - // block boundary. That leads to usages of xmm registers - // outside of check for %al. Pass physical registers to - // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling. - TheMachineFunction.getRegInfo().addLiveIn(Reg); - LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32)); - } - } - - // Store the integer parameter registers. - SmallVector MemOps; - SDValue RSFIN = - DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - TargLowering.getPointerTy(DAG.getDataLayout())); - unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (SDValue Val : LiveGPRs) { - SDValue FIN = DAG.getNode(ISD::ADD, DL, - TargLowering.getPointerTy(DAG.getDataLayout()), - RSFIN, DAG.getIntPtrConstant(Offset, DL)); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), - FuncInfo->getRegSaveFrameIndex(), Offset)); - MemOps.push_back(Store); - Offset += 8; - } - - // Now store the XMM (fp + vector) parameter registers. 
- if (!LiveXMMRegs.empty()) { - SmallVector SaveXMMOps; - SaveXMMOps.push_back(Chain); - SaveXMMOps.push_back(ALVal); - SaveXMMOps.push_back(RSFIN); - SaveXMMOps.push_back( - DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32)); - llvm::append_range(SaveXMMOps, LiveXMMRegs); - MachineMemOperand *StoreMMO = - DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), - Offset), - MachineMemOperand::MOStore, 128, Align(16)); - MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS, - DL, DAG.getVTList(MVT::Other), - SaveXMMOps, MVT::i8, StoreMMO)); - } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); - } -} - -void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) { - // Find the largest legal vector type. - MVT VecVT = MVT::Other; - // FIXME: Only some x86_32 calling conventions support AVX512. - if (Subtarget.useAVX512Regs() && - (is64Bit() || (CallConv == CallingConv::X86_VectorCall || - CallConv == CallingConv::Intel_OCL_BI))) - VecVT = MVT::v16f32; - else if (Subtarget.hasAVX()) - VecVT = MVT::v8f32; - else if (Subtarget.hasSSE2()) - VecVT = MVT::v4f32; - - // We forward some GPRs and some vector types. - SmallVector RegParmTypes; - MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32; - RegParmTypes.push_back(IntVT); - if (VecVT != MVT::Other) - RegParmTypes.push_back(VecVT); - - // Compute the set of forwarded registers. The rest are scratch. - SmallVectorImpl &Forwards = - FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); - - // Forward AL for SysV x86_64 targets, since it is used for varargs. 
- if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) { - Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); - Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); - } - - // Copy all forwards from physical to virtual registers. - for (ForwardedRegister &FR : Forwards) { - // FIXME: Can we use a less constrained schedule? - SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT); - FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister( - TargLowering.getRegClassFor(FR.VT)); - Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal); - } -} - -void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain, - unsigned StackSize) { - // Set FrameIndex to the 0xAAAAAAA value to mark unset state. - // If necessary, it would be set into the correct value later. - FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); - FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); - - if (FrameInfo.hasVAStart()) - createVarArgAreaAndStoreRegisters(Chain, StackSize); - - if (FrameInfo.hasMustTailInVarArgFunc()) - forwardMustTailParameters(Chain); -} - -SDValue X86TargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, - const SmallVectorImpl &Ins, const SDLoc &dl, - SelectionDAG &DAG, SmallVectorImpl &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); - X86MachineFunctionInfo *FuncInfo = MF.getInfo(); - - const Function &F = MF.getFunction(); - if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && - F.getName() == "main") - FuncInfo->setForceFramePointer(true); - - MachineFrameInfo &MFI = MF.getFrameInfo(); - bool Is64Bit = Subtarget.is64Bit(); - bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); - - assert( - !(IsVarArg && canGuaranteeTCO(CallConv)) && - "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); - - // Assign locations to all of the incoming arguments. 
- SmallVector ArgLocs; - CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); - - // Allocate shadow area for Win64. - if (IsWin64) - CCInfo.AllocateStack(32, Align(8)); - - CCInfo.AnalyzeArguments(Ins, CC_X86); - - // In vectorcall calling convention a second pass is required for the HVA - // types. - if (CallingConv::X86_VectorCall == CallConv) { - CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); - } - - // The next loop assumes that the locations are in the same order of the - // input arguments. - assert(isSortedByValueNo(ArgLocs) && - "Argument Location list must be sorted before lowering"); - - SDValue ArgValue; - for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; - ++I, ++InsIndex) { - assert(InsIndex < Ins.size() && "Invalid Ins index"); - CCValAssign &VA = ArgLocs[I]; - - if (VA.isRegLoc()) { - EVT RegVT = VA.getLocVT(); - if (VA.needsCustom()) { - assert( - VA.getValVT() == MVT::v64i1 && - "Currently the only custom case is when we split v64i1 to 2 regs"); - - // v64i1 values, in regcall calling convention, that are - // compiled to 32 bit arch, are split up into two registers. - ArgValue = - getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); - } else { - const TargetRegisterClass *RC; - if (RegVT == MVT::i8) - RC = &X86::GR8RegClass; - else if (RegVT == MVT::i16) - RC = &X86::GR16RegClass; - else if (RegVT == MVT::i32) - RC = &X86::GR32RegClass; - else if (Is64Bit && RegVT == MVT::i64) - RC = &X86::GR64RegClass; - else if (RegVT == MVT::f16) - RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; - else if (RegVT == MVT::f32) - RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; - else if (RegVT == MVT::f64) - RC = Subtarget.hasAVX512() ? 
&X86::FR64XRegClass : &X86::FR64RegClass; - else if (RegVT == MVT::f80) - RC = &X86::RFP80RegClass; - else if (RegVT == MVT::f128) - RC = &X86::VR128RegClass; - else if (RegVT.is512BitVector()) - RC = &X86::VR512RegClass; - else if (RegVT.is256BitVector()) - RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; - else if (RegVT.is128BitVector()) - RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; - else if (RegVT == MVT::x86mmx) - RC = &X86::VR64RegClass; - else if (RegVT == MVT::v1i1) - RC = &X86::VK1RegClass; - else if (RegVT == MVT::v8i1) - RC = &X86::VK8RegClass; - else if (RegVT == MVT::v16i1) - RC = &X86::VK16RegClass; - else if (RegVT == MVT::v32i1) - RC = &X86::VK32RegClass; - else if (RegVT == MVT::v64i1) - RC = &X86::VK64RegClass; - else - llvm_unreachable("Unknown argument type!"); - - Register Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); - } - - // If this is an 8 or 16-bit value, it is really passed promoted to 32 - // bits. Insert an assert[sz]ext to capture this, then truncate to the - // right size. - if (VA.getLocInfo() == CCValAssign::SExt) - ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - else if (VA.getLocInfo() == CCValAssign::ZExt) - ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - else if (VA.getLocInfo() == CCValAssign::BCvt) - ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); - - if (VA.isExtInLoc()) { - // Handle MMX values passed in XMM regs. 
- if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) - ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); - else if (VA.getValVT().isVector() && - VA.getValVT().getScalarType() == MVT::i1 && - ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || - (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { - // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 - ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); - } else - ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); - } - } else { - assert(VA.isMemLoc()); - ArgValue = - LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); - } - - // If value is passed via pointer - do a load. - if (VA.getLocInfo() == CCValAssign::Indirect && - !(Ins[I].Flags.isByVal() && VA.isRegLoc())) { - ArgValue = - DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); - } - - InVals.push_back(ArgValue); - } - - for (unsigned I = 0, E = Ins.size(); I != E; ++I) { - if (Ins[I].Flags.isSwiftAsync()) { - auto X86FI = MF.getInfo(); - if (Subtarget.is64Bit()) - X86FI->setHasSwiftAsyncContext(true); - else { - int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false); - X86FI->setSwiftAsyncContextFrameIdx(FI); - SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I], - DAG.getFrameIndex(FI, MVT::i32), - MachinePointerInfo::getFixedStack(MF, FI)); - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain); - } - } - - // Swift calling convention does not require we copy the sret argument - // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. - if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail) - continue; - - // All x86 ABIs require that for returning structs by value we copy the - // sret argument into %rax/%eax (depending on ABI) for the return. Save - // the argument into a virtual register so that we can access it from the - // return points. 
- if (Ins[I].Flags.isSRet()) { - assert(!FuncInfo->getSRetReturnReg() && - "SRet return has already been set"); - MVT PtrTy = getPointerTy(DAG.getDataLayout()); - Register Reg = - MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); - FuncInfo->setSRetReturnReg(Reg); - SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); - break; - } - } - - unsigned StackSize = CCInfo.getStackSize(); - // Align stack specially for tail calls. - if (shouldGuaranteeTCO(CallConv, - MF.getTarget().Options.GuaranteedTailCallOpt)) - StackSize = GetAlignedArgumentStackSize(StackSize, DAG); - - if (IsVarArg) - VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo) - .lowerVarArgsParameters(Chain, StackSize); - - // Some CCs need callee pop. - if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg, - MF.getTarget().Options.GuaranteedTailCallOpt)) { - FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. - } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { - // X86 interrupts must pop the error code (and the alignment padding) if - // present. - FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); - } else { - FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. - // If this is an sret function, the return should pop the hidden pointer. - if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget)) - FuncInfo->setBytesToPopOnReturn(4); - } - - if (!Is64Bit) { - // RegSaveFrameIndex is X86-64 only. 
- FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); - } - - FuncInfo->setArgumentStackSize(StackSize); - - if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { - EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); - if (Personality == EHPersonality::CoreCLR) { - assert(Is64Bit); - // TODO: Add a mechanism to frame lowering that will allow us to indicate - // that we'd prefer this slot be allocated towards the bottom of the frame - // (i.e. near the stack pointer after allocating the frame). Every - // funclet needs a copy of this slot in its (mostly empty) frame, and the - // offset from the bottom of this and each funclet's frame must be the - // same, so the size of funclets' (mostly empty) frames is dictated by - // how far this slot is from the bottom (since they allocate just enough - // space to accommodate holding this slot at the correct offset). - int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false); - EHInfo->PSPSymFrameIdx = PSPSymFI; - } - } - - if (shouldDisableArgRegFromCSR(CallConv) || - F.hasFnAttribute("no_caller_saved_registers")) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - for (std::pair Pair : MRI.liveins()) - MRI.disableCalleeSavedRegister(Pair.first); - } - - return Chain; -} - -SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, - SDValue Arg, const SDLoc &dl, - SelectionDAG &DAG, - const CCValAssign &VA, - ISD::ArgFlagsTy Flags, - bool isByVal) const { - unsigned LocMemOffset = VA.getLocMemOffset(); - SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - StackPtr, PtrOff); - if (isByVal) - return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - - MaybeAlign Alignment; - if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && - Arg.getSimpleValueType() != MVT::f80) - Alignment = MaybeAlign(4); - return DAG.getStore( - Chain, dl, Arg, PtrOff, - 
MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), - Alignment); -} - -/// Emit a load of return address if tail call -/// optimization is performed and it is required. -SDValue X86TargetLowering::EmitTailCallLoadRetAddr( - SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, - bool Is64Bit, int FPDiff, const SDLoc &dl) const { - // Adjust the Return address stack slot. - EVT VT = getPointerTy(DAG.getDataLayout()); - OutRetAddr = getReturnAddressFrameIndex(DAG); - - // Load the "old" Return address. - OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); - return SDValue(OutRetAddr.getNode(), 1); -} - -/// Emit a store of the return address if tail call -/// optimization is performed and it is required (FPDiff!=0). -static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, - SDValue Chain, SDValue RetAddrFrIdx, - EVT PtrVT, unsigned SlotSize, - int FPDiff, const SDLoc &dl) { - // Store the return address to the appropriate stack slot. - if (!FPDiff) return Chain; - // Calculate the new stack slot for the return address. - int NewReturnAddrFI = - MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, - false); - SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); - Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), NewReturnAddrFI)); - return Chain; -} - -/// Returns a vector_shuffle mask for an movs{s|d}, movd -/// operation of specified width. 
-static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, - SDValue V2) { - unsigned NumElems = VT.getVectorNumElements(); - SmallVector Mask; - Mask.push_back(NumElems); - for (unsigned i = 1; i != NumElems; ++i) - Mask.push_back(i); - return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); -} - -SDValue -X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - SelectionDAG &DAG = CLI.DAG; - SDLoc &dl = CLI.DL; - SmallVectorImpl &Outs = CLI.Outs; - SmallVectorImpl &OutVals = CLI.OutVals; - SmallVectorImpl &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; - SDValue Callee = CLI.Callee; - CallingConv::ID CallConv = CLI.CallConv; - bool &isTailCall = CLI.IsTailCall; - bool isVarArg = CLI.IsVarArg; - const auto *CB = CLI.CB; - - MachineFunction &MF = DAG.getMachineFunction(); - bool Is64Bit = Subtarget.is64Bit(); - bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); - bool IsSibcall = false; - bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || - CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail; - bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget); - X86MachineFunctionInfo *X86Info = MF.getInfo(); - bool HasNCSR = (CB && isa(CB) && - CB->hasFnAttr("no_caller_saved_registers")); - bool HasNoCfCheck = (CB && CB->doesNoCfCheck()); - bool IsIndirectCall = (CB && isa(CB) && CB->isIndirectCall()); - bool IsCFICall = IsIndirectCall && CLI.CFIType; - const Module *M = MF.getMMI().getModule(); - Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); - - MachineFunction::CallSiteInfo CSInfo; - if (CallConv == CallingConv::X86_INTR) - report_fatal_error("X86 interrupts may not be called directly"); - - bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall(); - if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) { - // If we are using a GOT, disable tail calls to external symbols with - // default visibility. 
Tail calling such a symbol requires using a GOT - // relocation, which forces early binding of the symbol. This breaks code - // that require lazy function symbol resolution. Using musttail or - // GuaranteedTailCallOpt will override this. - GlobalAddressSDNode *G = dyn_cast(Callee); - if (!G || (!G->getGlobal()->hasLocalLinkage() && - G->getGlobal()->hasDefaultVisibility())) - isTailCall = false; - } - - if (isTailCall && !IsMustTail) { - // Check if it's really possible to do a tail call. - isTailCall = IsEligibleForTailCallOptimization( - Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals, - Ins, DAG); - - // Sibcalls are automatically detected tailcalls which do not require - // ABI changes. - if (!IsGuaranteeTCO && isTailCall) - IsSibcall = true; - - if (isTailCall) - ++NumTailCalls; - } - - if (IsMustTail && !isTailCall) - report_fatal_error("failed to perform tail call elimination on a call " - "site marked musttail"); - - assert(!(isVarArg && canGuaranteeTCO(CallConv)) && - "Var args not supported with calling convention fastcc, ghc or hipe"); - - // Analyze operands of the call, assigning locations to each operand. - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); - - // Allocate shadow area for Win64. - if (IsWin64) - CCInfo.AllocateStack(32, Align(8)); - - CCInfo.AnalyzeArguments(Outs, CC_X86); - - // In vectorcall calling convention a second pass is required for the HVA - // types. - if (CallingConv::X86_VectorCall == CallConv) { - CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); - } - - // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); - if (IsSibcall) - // This is a sibcall. The memory operands are available in caller's - // own caller's stack. 
- NumBytes = 0; - else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv)) - NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); - - int FPDiff = 0; - if (isTailCall && - shouldGuaranteeTCO(CallConv, - MF.getTarget().Options.GuaranteedTailCallOpt)) { - // Lower arguments at fp - stackoffset + fpdiff. - unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); - - FPDiff = NumBytesCallerPushed - NumBytes; - - // Set the delta of movement of the returnaddr stackslot. - // But only set if delta is greater than previous delta. - if (FPDiff < X86Info->getTCReturnAddrDelta()) - X86Info->setTCReturnAddrDelta(FPDiff); - } - - unsigned NumBytesToPush = NumBytes; - unsigned NumBytesToPop = NumBytes; - - // If we have an inalloca argument, all stack space has already been allocated - // for us and be right at the top of the stack. We don't support multiple - // arguments passed in memory when using inalloca. - if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { - NumBytesToPush = 0; - if (!ArgLocs.back().isMemLoc()) - report_fatal_error("cannot use inalloca attribute on a register " - "parameter"); - if (ArgLocs.back().getLocMemOffset() != 0) - report_fatal_error("any parameter with the inalloca attribute must be " - "the only memory argument"); - } else if (CLI.IsPreallocated) { - assert(ArgLocs.back().isMemLoc() && - "cannot use preallocated attribute on a register " - "parameter"); - SmallVector PreallocatedOffsets; - for (size_t i = 0; i < CLI.OutVals.size(); ++i) { - if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) { - PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset()); - } - } - auto *MFI = DAG.getMachineFunction().getInfo(); - size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB); - MFI->setPreallocatedStackSize(PreallocatedId, NumBytes); - MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets); - NumBytesToPush = 0; - } - - if (!IsSibcall && !IsMustTail) - Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, - NumBytes 
- NumBytesToPush, dl); - - SDValue RetAddrFrIdx; - // Load return address for tail calls. - if (isTailCall && FPDiff) - Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, - Is64Bit, FPDiff, dl); - - SmallVector, 8> RegsToPass; - SmallVector MemOpChains; - SDValue StackPtr; - - // The next loop assumes that the locations are in the same order of the - // input arguments. - assert(isSortedByValueNo(ArgLocs) && - "Argument Location list must be sorted before lowering"); - - // Walk the register/memloc assignments, inserting copies/loads. In the case - // of tail call optimization arguments are handle later. - const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; - ++I, ++OutIndex) { - assert(OutIndex < Outs.size() && "Invalid Out index"); - // Skip inalloca/preallocated arguments, they have already been written. - ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; - if (Flags.isInAlloca() || Flags.isPreallocated()) - continue; - - CCValAssign &VA = ArgLocs[I]; - EVT RegVT = VA.getLocVT(); - SDValue Arg = OutVals[OutIndex]; - bool isByVal = Flags.isByVal(); - - // Promote the value if needed. - switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: break; - case CCValAssign::SExt: - Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); - break; - case CCValAssign::ZExt: - Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); - break; - case CCValAssign::AExt: - if (Arg.getValueType().isVector() && - Arg.getValueType().getVectorElementType() == MVT::i1) - Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); - else if (RegVT.is128BitVector()) { - // Special case: passing MMX values in XMM registers. 
- Arg = DAG.getBitcast(MVT::i64, Arg); - Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); - Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); - } else - Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); - break; - case CCValAssign::BCvt: - Arg = DAG.getBitcast(RegVT, Arg); - break; - case CCValAssign::Indirect: { - if (isByVal) { - // Memcpy the argument to a temporary stack slot to prevent - // the caller from seeing any modifications the callee may make - // as guaranteed by the `byval` attribute. - int FrameIdx = MF.getFrameInfo().CreateStackObject( - Flags.getByValSize(), - std::max(Align(16), Flags.getNonZeroByValAlign()), false); - SDValue StackSlot = - DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout())); - Chain = - CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl); - // From now on treat this as a regular pointer - Arg = StackSlot; - isByVal = false; - } else { - // Store the argument. - SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); - int FI = cast(SpillSlot)->getIndex(); - Chain = DAG.getStore( - Chain, dl, Arg, SpillSlot, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); - Arg = SpillSlot; - } - break; - } - } - - if (VA.needsCustom()) { - assert(VA.getValVT() == MVT::v64i1 && - "Currently the only custom case is when we split v64i1 to 2 regs"); - // Split v64i1 value into two registers - Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); - } else if (VA.isRegLoc()) { - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - const TargetOptions &Options = DAG.getTarget().Options; - if (Options.EmitCallSiteInfo) - CSInfo.emplace_back(VA.getLocReg(), I); - if (isVarArg && IsWin64) { - // Win64 ABI requires argument XMM reg to be copied to the corresponding - // shadow reg if callee is a varargs function. 
- Register ShadowReg; - switch (VA.getLocReg()) { - case X86::XMM0: ShadowReg = X86::RCX; break; - case X86::XMM1: ShadowReg = X86::RDX; break; - case X86::XMM2: ShadowReg = X86::R8; break; - case X86::XMM3: ShadowReg = X86::R9; break; - } - if (ShadowReg) - RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); - } - } else if (!IsSibcall && (!isTailCall || isByVal)) { - assert(VA.isMemLoc()); - if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), - getPointerTy(DAG.getDataLayout())); - MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, - dl, DAG, VA, Flags, isByVal)); - } - } - - if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); - - if (Subtarget.isPICStyleGOT()) { - // ELF / PIC requires GOT in the EBX register before function calls via PLT - // GOT pointer (except regcall). - if (!isTailCall) { - // Indirect call with RegCall calling convertion may use up all the - // general registers, so it is not suitable to bind EBX reister for - // GOT address, just let register allocator handle it. - if (CallConv != CallingConv::X86_RegCall) - RegsToPass.push_back(std::make_pair( - Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), - getPointerTy(DAG.getDataLayout())))); - } else { - // If we are tail calling and generating PIC/GOT style code load the - // address of the callee into ECX. The value in ecx is used as target of - // the tail jump. This is done to circumvent the ebx/callee-saved problem - // for tail calls on PIC/GOT architectures. Normally we would just put the - // address of GOT into ebx and then call target@PLT. But for tail calls - // ebx would be restored (since ebx is callee saved) before jumping to the - // target@PLT. - - // Note: The actual moving to ECX is done further down. 
- GlobalAddressSDNode *G = dyn_cast(Callee); - if (G && !G->getGlobal()->hasLocalLinkage() && - G->getGlobal()->hasDefaultVisibility()) - Callee = LowerGlobalAddress(Callee, DAG); - else if (isa(Callee)) - Callee = LowerExternalSymbol(Callee, DAG); - } - } - - if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail && - (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) { - // From AMD64 ABI document: - // For calls that may call functions that use varargs or stdargs - // (prototype-less calls or calls to functions containing ellipsis (...) in - // the declaration) %al is used as hidden argument to specify the number - // of SSE registers used. The contents of %al do not need to match exactly - // the number of registers, but must be an ubound on the number of SSE - // registers used and is in the range 0 - 8 inclusive. - - // Count the number of XMM registers allocated. - static const MCPhysReg XMMArgRegs[] = { - X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, - X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 - }; - unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); - assert((Subtarget.hasSSE1() || !NumXMMRegs) - && "SSE registers cannot be used when SSE is disabled"); - RegsToPass.push_back(std::make_pair(Register(X86::AL), - DAG.getConstant(NumXMMRegs, dl, - MVT::i8))); - } - - if (isVarArg && IsMustTail) { - const auto &Forwards = X86Info->getForwardedMustTailRegParms(); - for (const auto &F : Forwards) { - SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(F.PReg, Val)); - } - } - - // For tail calls lower the arguments to the 'real' stack slots. Sibcalls - // don't need this because the eligibility check rejects calls that require - // shuffling arguments passed in memory. 
- if (!IsSibcall && isTailCall) { - // Force all the incoming stack arguments to be loaded from the stack - // before any new outgoing arguments are stored to the stack, because the - // outgoing stack slots may alias the incoming argument stack slots, and - // the alias isn't otherwise explicit. This is slightly more conservative - // than necessary, because it means that each store effectively depends - // on every argument instead of just those arguments it would clobber. - SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); - - SmallVector MemOpChains2; - SDValue FIN; - int FI = 0; - for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; - ++I, ++OutsIndex) { - CCValAssign &VA = ArgLocs[I]; - - if (VA.isRegLoc()) { - if (VA.needsCustom()) { - assert((CallConv == CallingConv::X86_RegCall) && - "Expecting custom case only in regcall calling convention"); - // This means that we are in special case where one argument was - // passed through two register locations - Skip the next location - ++I; - } - - continue; - } - - assert(VA.isMemLoc()); - SDValue Arg = OutVals[OutsIndex]; - ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; - // Skip inalloca/preallocated arguments. They don't require any work. - if (Flags.isInAlloca() || Flags.isPreallocated()) - continue; - // Create frame index. - int32_t Offset = VA.getLocMemOffset()+FPDiff; - uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; - FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); - FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - - if (Flags.isByVal()) { - // Copy relative to framepointer. 
- SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); - if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), - getPointerTy(DAG.getDataLayout())); - Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - StackPtr, Source); - - MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, - ArgChain, - Flags, DAG, dl)); - } else { - // Store relative to framepointer. - MemOpChains2.push_back(DAG.getStore( - ArgChain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); - } - } - - if (!MemOpChains2.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); - - // Store the return address to the appropriate stack slot. - Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, - getPointerTy(DAG.getDataLayout()), - RegInfo->getSlotSize(), FPDiff, dl); - } - - // Build a sequence of copy-to-reg nodes chained together with token chain - // and glue operands which copy the outgoing args into registers. - SDValue InGlue; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InGlue); - InGlue = Chain.getValue(1); - } - - if (DAG.getTarget().getCodeModel() == CodeModel::Large) { - assert(Is64Bit && "Large code model is only legal in 64-bit mode."); - // In the 64-bit large code model, we have to make all calls - // through a register, since the call instruction's 32-bit - // pc-relative offset may not be large enough to hold the whole - // address. - } else if (Callee->getOpcode() == ISD::GlobalAddress || - Callee->getOpcode() == ISD::ExternalSymbol) { - // Lower direct calls to global addresses and external symbols. Setting - // ForCall to true here has the effect of removing WrapperRIP when possible - // to allow direct calls to be selected without first materializing the - // address into a register. 
- Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true); - } else if (Subtarget.isTarget64BitILP32() && - Callee.getValueType() == MVT::i32) { - // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI - Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); - } - - // Returns a chain & a glue for retval copy to use. - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - SmallVector Ops; - - if (!IsSibcall && isTailCall && !IsMustTail) { - Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl); - InGlue = Chain.getValue(1); - } - - Ops.push_back(Chain); - Ops.push_back(Callee); - - if (isTailCall) - Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32)); - - // Add argument registers to the end of the list so that they are known live - // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); - - // Add a register mask operand representing the call-preserved registers. - const uint32_t *Mask = [&]() { - auto AdaptedCC = CallConv; - // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists), - // use X86_INTR calling convention because it has the same CSR mask - // (same preserved registers). - if (HasNCSR) - AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR; - // If NoCalleeSavedRegisters is requested, than use GHC since it happens - // to use the CSR_NoRegs_RegMask. - if (CB && CB->hasFnAttr("no_callee_saved_registers")) - AdaptedCC = (CallingConv::ID)CallingConv::GHC; - return RegInfo->getCallPreservedMask(MF, AdaptedCC); - }(); - assert(Mask && "Missing call preserved mask for calling convention"); - - // If this is an invoke in a 32-bit function using a funclet-based - // personality, assume the function clobbers all registers. If an exception - // is thrown, the runtime will not restore CSRs. 
- // FIXME: Model this more precisely so that we can register allocate across - // the normal edge and spill and fill across the exceptional edge. - if (!Is64Bit && CLI.CB && isa(CLI.CB)) { - const Function &CallerFn = MF.getFunction(); - EHPersonality Pers = - CallerFn.hasPersonalityFn() - ? classifyEHPersonality(CallerFn.getPersonalityFn()) - : EHPersonality::Unknown; - if (isFuncletEHPersonality(Pers)) - Mask = RegInfo->getNoPreservedMask(); - } - - // Define a new register mask from the existing mask. - uint32_t *RegMask = nullptr; - - // In some calling conventions we need to remove the used physical registers - // from the reg mask. Create a new RegMask for such calling conventions. - // RegMask for calling conventions that disable only return registers (e.g. - // preserve_most) will be modified later in LowerCallResult. - bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR; - if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) { - const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); - - // Allocate a new Reg Mask and copy Mask. - RegMask = MF.allocateRegMask(); - unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); - memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize); - - // Make sure all sub registers of the argument registers are reset - // in the RegMask. - if (ShouldDisableArgRegs) { - for (auto const &RegPair : RegsToPass) - for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first)) - RegMask[SubReg / 32] &= ~(1u << (SubReg % 32)); - } - - // Create the RegMask Operand according to our updated mask. - Ops.push_back(DAG.getRegisterMask(RegMask)); - } else { - // Create the RegMask Operand according to the static mask. - Ops.push_back(DAG.getRegisterMask(Mask)); - } - - if (InGlue.getNode()) - Ops.push_back(InGlue); - - if (isTailCall) { - // We used to do: - //// If this is the first return lowered for this function, add the regs - //// to the liveout set for the function. 
- // This isn't right, although it's probably harmless on x86; liveouts - // should be computed from returns not tail calls. Consider a void - // function making a tail call to a function returning int. - MF.getFrameInfo().setHasTailCall(); - SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); - - if (IsCFICall) - Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); - - DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); - DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); - return Ret; - } - - if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) { - Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops); - } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { - // Calls with a "clang.arc.attachedcall" bundle are special. They should be - // expanded to the call, directly followed by a special marker sequence and - // a call to a ObjC library function. Use the CALL_RVMARKER to do that. - assert(!isTailCall && - "tail calls cannot be marked with clang.arc.attachedcall"); - assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode"); - - // Add a target global address for the retainRV/claimRV runtime function - // just before the call target. - Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT); - Ops.insert(Ops.begin() + 1, GA); - Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops); - } else { - Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); - } - - if (IsCFICall) - Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); - - InGlue = Chain.getValue(1); - DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); - DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); - - // Save heapallocsite metadata. - if (CLI.CB) - if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite")) - DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc); - - // Create the CALLSEQ_END node. 
- unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing. - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, - DAG.getTarget().Options.GuaranteedTailCallOpt)) - NumBytesForCalleeToPop = NumBytes; // Callee pops everything - else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet) - // If this call passes a struct-return pointer, the callee - // pops that struct pointer. - NumBytesForCalleeToPop = 4; - - // Returns a glue for retval copy to use. - if (!IsSibcall) { - Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop, - InGlue, dl); - InGlue = Chain.getValue(1); - } - - // Handle result values, copying them out of physregs into vregs that we - // return. - return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG, - InVals, RegMask); -} - -//===----------------------------------------------------------------------===// -// Fast Calling Convention (tail call) implementation -//===----------------------------------------------------------------------===// - -// Like std call, callee cleans arguments, convention except that ECX is -// reserved for storing the tail called function address. Only 2 registers are -// free for argument passing (inreg). Tail call optimization is performed -// provided: -// * tailcallopt is enabled -// * caller/callee are fastcc -// On X86_64 architecture with GOT-style position independent code only local -// (within module) calls are supported at the moment. -// To keep the stack aligned according to platform abi the function -// GetAlignedArgumentStackSize ensures that argument delta is always multiples -// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example) -// If a tail called function callee has more arguments than the caller the -// caller needs to make sure that there is room to move the RETADDR to. 
This is -// achieved by reserving an area the size of the argument delta right after the -// original RETADDR, but before the saved framepointer or the spilled registers -// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) -// stack layout: -// arg1 -// arg2 -// RETADDR -// [ new RETADDR -// move area ] -// (possible EBP) -// ESI -// EDI -// local1 .. - -/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align -/// requirement. -unsigned -X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, - SelectionDAG &DAG) const { - const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign(); - const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); - assert(StackSize % SlotSize == 0 && - "StackSize must be a multiple of SlotSize"); - return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize; -} - -/// Return true if the given stack call argument is already available in the -/// same position (relatively) of the caller's incoming argument stack. -static -bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, - MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, - const X86InstrInfo *TII, const CCValAssign &VA) { - unsigned Bytes = Arg.getValueSizeInBits() / 8; - - for (;;) { - // Look through nodes that don't alter the bits of the incoming value. 
- unsigned Op = Arg.getOpcode(); - if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { - Arg = Arg.getOperand(0); - continue; - } - if (Op == ISD::TRUNCATE) { - const SDValue &TruncInput = Arg.getOperand(0); - if (TruncInput.getOpcode() == ISD::AssertZext && - cast(TruncInput.getOperand(1))->getVT() == - Arg.getValueType()) { - Arg = TruncInput.getOperand(0); - continue; - } - } - break; - } - - int FI = INT_MAX; - if (Arg.getOpcode() == ISD::CopyFromReg) { - Register VR = cast(Arg.getOperand(1))->getReg(); - if (!VR.isVirtual()) - return false; - MachineInstr *Def = MRI->getVRegDef(VR); - if (!Def) - return false; - if (!Flags.isByVal()) { - if (!TII->isLoadFromStackSlot(*Def, FI)) - return false; - } else { - unsigned Opcode = Def->getOpcode(); - if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || - Opcode == X86::LEA64_32r) && - Def->getOperand(1).isFI()) { - FI = Def->getOperand(1).getIndex(); - Bytes = Flags.getByValSize(); - } else - return false; - } - } else if (LoadSDNode *Ld = dyn_cast(Arg)) { - if (Flags.isByVal()) - // ByVal argument is passed in as a pointer but it's now being - // dereferenced. e.g. - // define @foo(%struct.X* %A) { - // tail call @bar(%struct.X* byval %A) - // } - return false; - SDValue Ptr = Ld->getBasePtr(); - FrameIndexSDNode *FINode = dyn_cast(Ptr); - if (!FINode) - return false; - FI = FINode->getIndex(); - } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { - FrameIndexSDNode *FINode = cast(Arg); - FI = FINode->getIndex(); - Bytes = Flags.getByValSize(); - } else - return false; - - assert(FI != INT_MAX); - if (!MFI.isFixedObjectIndex(FI)) - return false; - - if (Offset != MFI.getObjectOffset(FI)) - return false; - - // If this is not byval, check that the argument stack object is immutable. - // inalloca and argument copy elision can create mutable argument stack - // objects. Byval objects can be mutated, but a byval call intends to pass the - // mutated memory. 
- if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) - return false; - - if (VA.getLocVT().getFixedSizeInBits() > - Arg.getValueSizeInBits().getFixedValue()) { - // If the argument location is wider than the argument type, check that any - // extension flags match. - if (Flags.isZExt() != MFI.isObjectZExt(FI) || - Flags.isSExt() != MFI.isObjectSExt(FI)) { - return false; - } - } - - return Bytes == MFI.getObjectSize(FI); -} - -/// Check whether the call is eligible for tail call optimization. Targets -/// that want to do tail call optimization should implement this function. -bool X86TargetLowering::IsEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet, - bool isVarArg, Type *RetTy, const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, SelectionDAG &DAG) const { - if (!mayTailCallThisCC(CalleeCC)) - return false; - - // If -tailcallopt is specified, make fastcc functions tail-callable. - MachineFunction &MF = DAG.getMachineFunction(); - const Function &CallerF = MF.getFunction(); - - // If the function return type is x86_fp80 and the callee return type is not, - // then the FP_EXTEND of the call result is not a nop. It's not safe to - // perform a tailcall optimization here. - if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) - return false; - - CallingConv::ID CallerCC = CallerF.getCallingConv(); - bool CCMatch = CallerCC == CalleeCC; - bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); - bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); - bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || - CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail; - - // Win64 functions have extra shadow space for argument homing. Don't do the - // sibcall if the caller and callee have mismatched expectations for this - // space. 
- if (IsCalleeWin64 != IsCallerWin64) - return false; - - if (IsGuaranteeTCO) { - if (canGuaranteeTCO(CalleeCC) && CCMatch) - return true; - return false; - } - - // Look for obvious safe cases to perform tail call optimization that do not - // require ABI changes. This is what gcc calls sibcall. - - // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to - // emit a special epilogue. - const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - if (RegInfo->hasStackRealignment(MF)) - return false; - - // Also avoid sibcall optimization if we're an sret return fn and the callee - // is incompatible. See comment in LowerReturn about why hasStructRetAttr is - // insufficient. - if (MF.getInfo()->getSRetReturnReg()) { - // For a compatible tail call the callee must return our sret pointer. So it - // needs to be (a) an sret function itself and (b) we pass our sret as its - // sret. Condition #b is harder to determine. - return false; - } else if (IsCalleePopSRet) - // The callee pops an sret, so we cannot tail-call, as our caller doesn't - // expect that. - return false; - - // Do not sibcall optimize vararg calls unless all arguments are passed via - // registers. - LLVMContext &C = *DAG.getContext(); - if (isVarArg && !Outs.empty()) { - // Optimizing for varargs on Win64 is unlikely to be safe without - // additional testing. - if (IsCalleeWin64 || IsCallerWin64) - return false; - - SmallVector ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); - - CCInfo.AnalyzeCallOperands(Outs, CC_X86); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) - if (!ArgLocs[i].isRegLoc()) - return false; - } - - // If the call result is in ST0 / ST1, it needs to be popped off the x87 - // stack. Therefore, if it's not used by the call it is not safe to optimize - // this into a sibcall. 
- bool Unused = false; - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - if (!Ins[i].Used) { - Unused = true; - break; - } - } - if (Unused) { - SmallVector RVLocs; - CCState CCInfo(CalleeCC, false, MF, RVLocs, C); - CCInfo.AnalyzeCallResult(Ins, RetCC_X86); - for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { - CCValAssign &VA = RVLocs[i]; - if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) - return false; - } - } - - // Check that the call results are passed in the same way. - if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, - RetCC_X86, RetCC_X86)) - return false; - // The callee has to preserve all registers the caller needs to preserve. - const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); - const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); - if (!CCMatch) { - const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); - if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) - return false; - } - - unsigned StackArgsSize = 0; - - // If the callee takes no arguments then go on to check the results of the - // call. - if (!Outs.empty()) { - // Check if stack adjustment is needed. For now, do not do this if any - // argument is passed on the stack. - SmallVector ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); - - // Allocate shadow area for Win64 - if (IsCalleeWin64) - CCInfo.AllocateStack(32, Align(8)); - - CCInfo.AnalyzeCallOperands(Outs, CC_X86); - StackArgsSize = CCInfo.getStackSize(); - - if (CCInfo.getStackSize()) { - // Check if the arguments are already laid out in the right way as - // the caller's fixed stack objects. 
- MachineFrameInfo &MFI = MF.getFrameInfo(); - const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const X86InstrInfo *TII = Subtarget.getInstrInfo(); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; - if (VA.getLocInfo() == CCValAssign::Indirect) - return false; - if (!VA.isRegLoc()) { - if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, - MFI, MRI, TII, VA)) - return false; - } - } - } - - bool PositionIndependent = isPositionIndependent(); - // If the tailcall address may be in a register, then make sure it's - // possible to register allocate for it. In 32-bit, the call address can - // only target EAX, EDX, or ECX since the tail call must be scheduled after - // callee-saved registers are restored. These happen to be the same - // registers used to pass 'inreg' arguments so watch out for those. - if (!Subtarget.is64Bit() && ((!isa(Callee) && - !isa(Callee)) || - PositionIndependent)) { - unsigned NumInRegs = 0; - // In PIC we need an extra register to formulate the address computation - // for the callee. - unsigned MaxInRegs = PositionIndependent ? 2 : 3; - - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - if (!VA.isRegLoc()) - continue; - Register Reg = VA.getLocReg(); - switch (Reg) { - default: break; - case X86::EAX: case X86::EDX: case X86::ECX: - if (++NumInRegs == MaxInRegs) - return false; - break; - } - } - } - - const MachineRegisterInfo &MRI = MF.getRegInfo(); - if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) - return false; - } - - bool CalleeWillPop = - X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, - MF.getTarget().Options.GuaranteedTailCallOpt); - - if (unsigned BytesToPop = - MF.getInfo()->getBytesToPopOnReturn()) { - // If we have bytes to pop, the callee must pop them. 
- bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; - if (!CalleePopMatches) - return false; - } else if (CalleeWillPop && StackArgsSize > 0) { - // If we don't have bytes to pop, make sure the callee doesn't pop any. - return false; - } - - return true; -} - -FastISel * -X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) const { - return X86::createFastISel(funcInfo, libInfo); -} - -//===----------------------------------------------------------------------===// -// Other Lowering Hooks -//===----------------------------------------------------------------------===// - -bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, - bool AssumeSingleUse) { - if (!AssumeSingleUse && !Op.hasOneUse()) - return false; - if (!ISD::isNormalLoad(Op.getNode())) - return false; - - // If this is an unaligned vector, make sure the target supports folding it. - auto *Ld = cast(Op.getNode()); - if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() && - Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16)) - return false; - - // TODO: If this is a non-temporal load and the target has an instruction - // for it, it should not be folded. See "useNonTemporalLoad()". - - return true; -} - -bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, - const X86Subtarget &Subtarget, - bool AssumeSingleUse) { - assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory"); - if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse)) - return false; - - // We can not replace a wide volatile load with a broadcast-from-memory, - // because that would narrow the load, which isn't legal for volatiles. 
- auto *Ld = cast(Op.getNode()); - return !Ld->isVolatile() || - Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits(); -} - -bool X86::mayFoldIntoStore(SDValue Op) { - return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); -} - -bool X86::mayFoldIntoZeroExtend(SDValue Op) { - if (Op.hasOneUse()) { - unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); - return (ISD::ZERO_EXTEND == Opcode); - } - return false; -} - -static bool isTargetShuffle(unsigned Opcode) { - switch(Opcode) { - default: return false; - case X86ISD::BLENDI: - case X86ISD::PSHUFB: - case X86ISD::PSHUFD: - case X86ISD::PSHUFHW: - case X86ISD::PSHUFLW: - case X86ISD::SHUFP: - case X86ISD::INSERTPS: - case X86ISD::EXTRQI: - case X86ISD::INSERTQI: - case X86ISD::VALIGN: - case X86ISD::PALIGNR: - case X86ISD::VSHLDQ: - case X86ISD::VSRLDQ: - case X86ISD::MOVLHPS: - case X86ISD::MOVHLPS: - case X86ISD::MOVSHDUP: - case X86ISD::MOVSLDUP: - case X86ISD::MOVDDUP: - case X86ISD::MOVSS: - case X86ISD::MOVSD: - case X86ISD::MOVSH: - case X86ISD::UNPCKL: - case X86ISD::UNPCKH: - case X86ISD::VBROADCAST: - case X86ISD::VPERMILPI: - case X86ISD::VPERMILPV: - case X86ISD::VPERM2X128: - case X86ISD::SHUF128: - case X86ISD::VPERMIL2: - case X86ISD::VPERMI: - case X86ISD::VPPERM: - case X86ISD::VPERMV: - case X86ISD::VPERMV3: - case X86ISD::VZEXT_MOVL: - return true; - } -} - -static bool isTargetShuffleVariableMask(unsigned Opcode) { - switch (Opcode) { - default: return false; - // Target Shuffles. - case X86ISD::PSHUFB: - case X86ISD::VPERMILPV: - case X86ISD::VPERMIL2: - case X86ISD::VPPERM: - case X86ISD::VPERMV: - case X86ISD::VPERMV3: - return true; - // 'Faux' Target Shuffles. 
- case ISD::OR: - case ISD::AND: - case X86ISD::ANDNP: - return true; - } -} - -SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - X86MachineFunctionInfo *FuncInfo = MF.getInfo(); - int ReturnAddrIndex = FuncInfo->getRAIndex(); - - if (ReturnAddrIndex == 0) { - // Set up a frame object for the return address. - unsigned SlotSize = RegInfo->getSlotSize(); - ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, - -(int64_t)SlotSize, - false); - FuncInfo->setRAIndex(ReturnAddrIndex); - } - - return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); -} - -bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, - bool hasSymbolicDisplacement) { - // Offset should fit into 32 bit immediate field. - if (!isInt<32>(Offset)) - return false; - - // If we don't have a symbolic displacement - we don't have any extra - // restrictions. - if (!hasSymbolicDisplacement) - return true; - - // FIXME: Some tweaks might be needed for medium code model. - if (M != CodeModel::Small && M != CodeModel::Kernel) - return false; - - // For small code model we assume that latest object is 16MB before end of 31 - // bits boundary. We may also accept pretty large negative constants knowing - // that all objects are in the positive half of address space. - if (M == CodeModel::Small && Offset < 16*1024*1024) - return true; - - // For kernel code model we know that all object resist in the negative half - // of 32bits address space. We may not accept negative offsets, since they may - // be just off and we may accept pretty large positive ones. - if (M == CodeModel::Kernel && Offset >= 0) - return true; + // For kernel code model we know that all object resist in the negative half + // of 32bits address space. 
We may not accept negative offsets, since they may + // be just off and we may accept pretty large positive ones. + if (M == CodeModel::Kernel && Offset >= 0) + return true; return false; } -/// Determines whether the callee is required to pop its own arguments. -/// Callee pop is necessary to support tail calls. -bool X86::isCalleePop(CallingConv::ID CallingConv, - bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { - // If GuaranteeTCO is true, we force some calls to be callee pop so that we - // can guarantee TCO. - if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) - return true; - - switch (CallingConv) { - default: - return false; - case CallingConv::X86_StdCall: - case CallingConv::X86_FastCall: - case CallingConv::X86_ThisCall: - case CallingConv::X86_VectorCall: - return !is64Bit; - } -} - /// Return true if the condition is an signed comparison operation. static bool isX86CCSigned(unsigned X86CC) { switch (X86CC) { diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn @@ -95,6 +95,7 @@ "X86FlagsCopyLowering.cpp", "X86FloatingPoint.cpp", "X86FrameLowering.cpp", + "X86ISelCallLowering.cpp", "X86ISelDAGToDAG.cpp", "X86ISelLowering.cpp", "X86IndirectBranchTracking.cpp",