Index: include/llvm/CodeGen/CallingConvLower.h
===================================================================
--- include/llvm/CodeGen/CallingConvLower.h
+++ include/llvm/CodeGen/CallingConvLower.h
@@ -296,6 +296,12 @@
   void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
                               CCAssignFn Fn);
 
+  /// Analyze incoming formal arguments; delegates to AnalyzeFormalArguments.
+  void AnalyzeArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                        CCAssignFn Fn) {
+    AnalyzeFormalArguments(Ins, Fn);
+  }
+
   /// AnalyzeReturn - Analyze the returned values of a return,
   /// incorporating info about the result values into this state.
   void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -318,11 +324,22 @@
                            SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
                            CCAssignFn Fn);
 
+  /// Analyze outgoing call operands; delegates to AnalyzeCallOperands.
+  void AnalyzeArguments(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        CCAssignFn Fn) {
+    AnalyzeCallOperands(Outs, Fn);
+  }
+
   /// AnalyzeCallResult - Analyze the return values of a call,
   /// incorporating info about the passed values into this state.
   void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
                          CCAssignFn Fn);
 
+  /// A shadow allocated register is a register that was allocated
+  /// but wasn't added to the location list (Locs).
+  /// \returns true if the register was allocated as a shadow register.
+  bool IsShadowAllocatedReg(unsigned Reg) const;
+
   /// AnalyzeCallResult - Same as above except it's specialized for calls which
   /// produce a single value.
   void AnalyzeCallResult(MVT VT, CCAssignFn Fn);
@@ -521,6 +538,37 @@
                          const SmallVectorImpl<ISD::InputArg> &Ins,
                          CCAssignFn CalleeFn, CCAssignFn CallerFn);
 
+  /// Run an additional analysis pass over the given function arguments.
+  /// Each argument is re-analyzed with the SecArgPass attribute flag set, and
+  /// the location list (Locs) is then re-sorted by original argument position.
+  template <class T>
+  void AnalyzeArgumentsSecondPass(const SmallVectorImpl<T> &Args,
+                                  CCAssignFn Fn) {
+    unsigned NumFirstPassLocs = Locs.size();
+
+    // Create a copy of the argument list in which each argument is marked
+    // with the SecArgPass flag.
+    SmallVector<T, 16> SecPassArg;
+    for (auto Arg : Args) {
+      Arg.Flags.setSecArgPass();
+      SecPassArg.push_back(Arg);
+    }
+
+    // Run the second argument pass.
+    AnalyzeArguments(SecPassArg, Fn);
+
+    // Sort the locations of the arguments according to their original
+    // position.
+    SmallVector<CCValAssign, 16> TmpArgLocs;
+    std::swap(TmpArgLocs, Locs);
+    auto B = TmpArgLocs.begin(), E = TmpArgLocs.end();
+    std::merge(B, B + NumFirstPassLocs, B + NumFirstPassLocs, E,
+               std::back_inserter(Locs),
+               [](const CCValAssign &A, const CCValAssign &B) -> bool {
+                 return A.getValNo() < B.getValNo();
+               });
+  }
+
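Usage sketch for the two-pass analysis (illustrative only, mirroring the X86ISelLowering.cpp changes later in this patch; CC_X86 stands in for whichever CCAssignFn the target passes in):

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

    // First pass: assigns non-HVA values and reserves shadow registers.
    CCInfo.AnalyzeArguments(Ins, CC_X86);

    // Second pass: only values carrying the Hva flag are assigned here, and
    // Locs ends up sorted by original value number again.
    if (CallConv == CallingConv::X86_VectorCall)
      CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
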
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
   void MarkAllocated(unsigned Reg);
Index: include/llvm/Target/TargetCallingConv.h
===================================================================
--- include/llvm/Target/TargetCallingConv.h
+++ include/llvm/Target/TargetCallingConv.h
@@ -51,6 +51,15 @@
     static const uint64_t SwiftSelfOffs  = 14;
     static const uint64_t SwiftError     = 1ULL<<15; ///< Swift error parameter
     static const uint64_t SwiftErrorOffs = 15;
+    static const uint64_t Hva            = 1ULL << 16; ///< HVA field for
+                                                       ///< vectorcall
+    static const uint64_t HvaOffs        = 16;
+    static const uint64_t HvaStart       = 1ULL << 17; ///< HVA structure start
+                                                       ///< for vectorcall
+    static const uint64_t HvaStartOffs   = 17;
+    static const uint64_t SecArgPass     = 1ULL << 18; ///< Second argument
+                                                       ///< pass for vectorcall
+    static const uint64_t SecArgPassOffs = 18;
     static const uint64_t OrigAlign      = 0x1FULL<<27;
     static const uint64_t OrigAlignOffs  = 27;
     static const uint64_t ByValSize      = 0x3fffffffULL<<32; ///< Struct size
@@ -91,6 +100,15 @@
     bool isSwiftError()  const { return Flags & SwiftError; }
     void setSwiftError() { Flags |= One << SwiftErrorOffs; }
 
+    bool isHva()         const { return Flags & Hva; }
+    void setHva()        { Flags |= One << HvaOffs; }
+
+    bool isHvaStart()    const { return Flags & HvaStart; }
+    void setHvaStart()   { Flags |= One << HvaStartOffs; }
+
+    bool isSecArgPass()  const { return Flags & SecArgPass; }
+    void setSecArgPass() { Flags |= One << SecArgPassOffs; }
+
     bool isNest()        const { return Flags & Nest; }
     void setNest()       { Flags |= One << NestOffs; }
 
Index: lib/CodeGen/CallingConvLower.cpp
===================================================================
--- lib/CodeGen/CallingConvLower.cpp
+++ lib/CodeGen/CallingConvLower.cpp
@@ -23,6 +23,8 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+
 using namespace llvm;
 
 CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
@@ -64,6 +66,22 @@
     UsedRegs[*AI/32] |= 1 << (*AI&31);
 }
 
+bool CCState::IsShadowAllocatedReg(unsigned Reg) const {
+  if (!isAllocated(Reg))
+    return false;
+
+  for (auto const &ValAssign : Locs) {
+    if (ValAssign.isRegLoc()) {
+      for (MCRegAliasIterator AI(ValAssign.getLocReg(), &TRI, true);
+           AI.isValid(); ++AI) {
+        if (*AI == Reg)
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
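Illustration of the new query (hypothetical values, mirroring CC_X86_VectorCallAssignRegister introduced later in this patch): a register that was reserved via AllocateReg() but never recorded in Locs is reported as shadow allocated, so a later pass may still bind a value to it.

    // XMM1 was only reserved on the first pass: it is allocated, but no
    // CCValAssign in Locs refers to it (or to any alias of it).
    if (State.IsShadowAllocatedReg(X86::XMM1))
      State.addLoc(CCValAssign::getReg(ValNo, ValVT, X86::XMM1, LocVT, LocInfo));
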
 /// Analyze an array of argument values,
 /// incorporating info about the formals into this state.
 void
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7733,8 +7733,19 @@
       Flags.setZExt();
     if (Args[i].isSExt)
       Flags.setSExt();
-    if (Args[i].isInReg)
+    if (Args[i].isInReg) {
+      // Under the vectorcall calling convention, a structure that is passed
+      // in a register (InReg) must be an HVA.
+      if (CLI.CallConv == CallingConv::X86_VectorCall &&
+          isa<StructType>(FinalType)) {
+        // Mark the first value of the structure as the start of the HVA.
+        if (Value == 0)
+          Flags.setHvaStart();
+        Flags.setHva();
+      }
+      // Set the InReg flag.
       Flags.setInReg();
+    }
     if (Args[i].isSRet)
       Flags.setSRet();
     if (Args[i].isSwiftSelf)
@@ -8020,8 +8031,19 @@
       Flags.setZExt();
     if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
       Flags.setSExt();
-    if (F.getAttributes().hasAttribute(Idx, Attribute::InReg))
+    if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) {
+      // Under the vectorcall calling convention, a structure that is passed
+      // in a register (InReg) must be an HVA.
+      if (F.getCallingConv() == CallingConv::X86_VectorCall &&
+          isa<StructType>(I->getType())) {
+        // Mark the first value of the structure as the start of the HVA.
+        if (Value == 0)
+          Flags.setHvaStart();
+        Flags.setHva();
+      }
+      // Set the InReg flag.
       Flags.setInReg();
+    }
     if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet))
       Flags.setSRet();
     if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf))
Index: lib/Target/X86/X86CallingConv.h
===================================================================
--- lib/Target/X86/X86CallingConv.h
+++ lib/Target/X86/X86CallingConv.h
@@ -24,23 +24,29 @@
 /// When regcall calling convention compiled to 32 bit arch, special treatment
 /// is required for 64 bit masks.
 /// The value should be assigned to two GPRs.
-/// @return true if registers were allocated and false otherwise
+/// \return true if registers were allocated and false otherwise.
 bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                    CCValAssign::LocInfo &LocInfo,
                                    ISD::ArgFlagsTy &ArgFlags, CCState &State);
 
-inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
-                                         MVT &LocVT,
-                                         CCValAssign::LocInfo &LocInfo,
-                                         ISD::ArgFlagsTy &ArgFlags,
-                                         CCState &State) {
-  // Similar to CCPassIndirect, with the addition of inreg.
-  LocVT = MVT::i32;
-  LocInfo = CCValAssign::Indirect;
-  ArgFlags.setInReg();
-  return false; // Continue the search, but now for i32.
-}
-
+/// The vectorcall calling convention requires special handling for vector
+/// types and HVAs on 64 bit targets.
+/// For HVAs, shadow registers may be allocated on the first pass, while the
+/// actual XMM registers are allocated on the second pass.
+/// For vector types, the actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
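For reference, the HVA notion used throughout this patch comes from the MSVC vectorcall specification: a Homogeneous Vector Aggregate is a composite type with up to four members of an identical vector type. A hypothetical C++ illustration (not part of the patch):

    struct HVA4 { __m128 x, y, z, w; };   // an HVA with four members

    // 'h' is an HVA argument; its four elements are the values that the
    // second-pass logic below binds to XMM registers.
    __m128 __vectorcall f(int a, HVA4 h, __m128 v);
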
+/// The vectorcall calling convention requires special handling for vector
+/// types and HVAs on 32 bit targets.
+/// For HVAs, the actual XMM registers are allocated on the second pass.
+/// For vector types, the actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State);
 
 inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
                                 CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
Index: lib/Target/X86/X86CallingConv.cpp
===================================================================
--- lib/Target/X86/X86CallingConv.cpp
+++ lib/Target/X86/X86CallingConv.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86Subtarget.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/IR/CallingConv.h"
 
@@ -39,14 +40,14 @@
   if (AvailableRegs.size() < RequiredGprsUponSplit)
     return false; // Not enough free registers - continue the search.
 
-  // Allocating the available registers
+  // Allocating the available registers.
   for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {
 
-    // Marking the register as located
+    // Marking the register as located.
    unsigned Reg = State.AllocateReg(AvailableRegs[I]);
 
     // Since we previously made sure that 2 registers are available
-    // we expect that a real register number will be returned
+    // we expect that a real register number will be returned.
     assert(Reg && "Expecting a register will be available");
 
     // Assign the value to the allocated register
@@ -57,4 +58,151 @@
   return true;
 }
 
+static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {
+  if (ValVT.is512BitVector()) {
+    static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
+                                           X86::ZMM3, X86::ZMM4, X86::ZMM5};
+    return RegListZMM;
+  }
+
+  if (ValVT.is256BitVector()) {
+    static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
+                                           X86::YMM3, X86::YMM4, X86::YMM5};
+    return RegListYMM;
+  }
+
+  static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+                                         X86::XMM3, X86::XMM4, X86::XMM5};
+  return RegListXMM;
+}
+
+static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
+  static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
+  return RegListGPR;
+}
+
+static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
+                                            MVT &LocVT,
+                                            CCValAssign::LocInfo &LocInfo,
+                                            ISD::ArgFlagsTy &ArgFlags,
+                                            CCState &State) {
+
+  ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);
+  bool Is64bit = static_cast<const X86Subtarget &>(
+                     State.getMachineFunction().getSubtarget())
+                     .is64Bit();
+
+  for (auto Reg : RegList) {
+    // If the register is not marked as allocated - assign to it.
+    if (!State.isAllocated(Reg)) {
+      unsigned AssignedReg = State.AllocateReg(Reg);
+      assert(AssignedReg == Reg && "Expecting a valid register allocation");
+      State.addLoc(
+          CCValAssign::getReg(ValNo, ValVT, AssignedReg, LocVT, LocInfo));
+      return true;
+    }
+    // If the register is marked as shadow allocated - assign to it.
+    if (Is64bit && State.IsShadowAllocatedReg(Reg)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return true;
+    }
+  }
+
+  llvm_unreachable("Clang should ensure that hva marked vectors will have "
+                   "an available register.");
+  return false;
+}
+
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // On the second pass, go through the HVAs only.
+  if (ArgFlags.isSecArgPass()) {
+    if (ArgFlags.isHva())
+      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+                                             ArgFlags, State);
+    return true;
+  }
+
+  // Process only vector types as defined by the vectorcall spec:
+  // "A vector type is either a floating-point type, for example,
+  // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+  if (!(ValVT.isFloatingPoint() ||
+        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+    // If R9 was already assigned it means that we are after the fourth element
+    // and because this is not an HVA / vector type, we need to allocate
+    // a shadow XMM register.
+    if (State.isAllocated(X86::R9)) {
+      // Assign a shadow XMM register.
+      (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+    }
+
+    return false;
+  }
+
+  if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {
+    // Assign a shadow GPR register.
+    (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());
+
+    // Assign an XMM register (shadow for an HVA, non-shadow otherwise).
+    if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+      // In the vectorcall calling convention, additional shadow stack can be
+      // created on top of the basic 32 bytes of Win64.
+      // It can happen if the fifth or sixth argument is a vector type or HVA.
+      // In that case, a shadow stack slot of 8 bytes is allocated per argument.
+      if (Reg == X86::XMM4 || Reg == X86::XMM5)
+        State.AllocateStack(8, 8);
+
+      if (!ArgFlags.isHva()) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+        return true; // Allocated a register - Stop the search.
+      }
+    }
+  }
+
+  // If this is an HVA - Stop the search,
+  // otherwise continue the search.
+  return ArgFlags.isHva();
+}
+
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // On the second pass, go through the HVAs only.
+  if (ArgFlags.isSecArgPass()) {
+    if (ArgFlags.isHva())
+      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+                                             ArgFlags, State);
+    return true;
+  }
+
+  // Process only vector types as defined by the vectorcall spec:
+  // "A vector type is either a floating-point type, for example,
+  // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+  if (!(ValVT.isFloatingPoint() ||
+        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+    return false;
+  }
+
+  if (ArgFlags.isHva())
+    return true; // If this is an HVA - Stop the search.
+
+  // Assign an XMM register.
+  if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+
+  // In case we did not find an available XMM register for a vector -
+  // pass it indirectly.
+  // This is similar to CCPassIndirect, with the addition of inreg.
+  if (!ValVT.isFloatingPoint()) {
+    LocVT = MVT::i32;
+    LocInfo = CCValAssign::Indirect;
+    ArgFlags.setInReg();
+  }
+
+  return false; // No register was assigned - Continue the search.
+}
+
 } // End llvm namespace
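A worked example of the 64-bit rules above (illustrative only; register numbers follow the vectorcall scheme of positional XMM assignment with HVA elements filled into the lowest unused XMM registers afterwards), using the HVA2 type defined in the test below:

    // <4 x float> f(float a, HVA2 b, float c)  --  x86_64 vectorcall
    //
    // First pass : a -> XMM0 (RCX shadowed), c -> XMM2 (R8 shadowed);
    //              b only reserves RDX and XMM1 as shadow registers.
    // Second pass: b.0 -> XMM1 (shadow reuse), b.1 -> XMM3 (first free XMM).
    // After the merge in AnalyzeArgumentsSecondPass, Locs is ordered
    // a, b.0, b.1, c again.
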
Index: lib/Target/X86/X86CallingConv.td
===================================================================
--- lib/Target/X86/X86CallingConv.td
+++ lib/Target/X86/X86CallingConv.td
@@ -305,20 +305,12 @@
   CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
 ]>;
 
-// X86-32 HiPE return-value convention.
+// X86-32 Vectorcall return-value convention.
 def RetCC_X86_32_VectorCall : CallingConv<[
-  // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+  // Floating point types are returned in XMM0, XMM1, XMM2 and XMM3.
+  CCIfType<[f32, f64, f128],
            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
 
-  // 256-bit FP vectors
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
-
-  // 512-bit FP vectors
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
-
   // Return integers in the standard way.
   CCDelegateTo<RetCC_X86Common>
 ]>;
@@ -347,6 +339,16 @@
   CCDelegateTo<RetCC_X86_64_C>
 ]>;
 
+// X86-64 vectorcall return-value convention.
+def RetCC_X86_64_Vectorcall : CallingConv<[
+  // Vectorcall calling convention always returns FP values in XMMs.
+  CCIfType<[f32, f64, f128],
+    CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // Otherwise, everything is the same as Windows X86-64 C CC.
+  CCDelegateTo<RetCC_X86_Win64_C>
+]>;
+
 // X86-64 HiPE return-value convention.
 def RetCC_X86_64_HiPE : CallingConv<[
   // Promote all types to i64
@@ -444,6 +446,9 @@
   CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
   CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
 
+  // Handle Vectorcall CC
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>,
+
   // Handle HHVM calls.
   CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
 
@@ -623,18 +628,7 @@
 ]>;
 
 def CC_X86_Win64_VectorCall : CallingConv<[
-  // The first 6 floating point and vector types of 128 bits or less use
-  // XMM0-XMM5.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
-
-  // 256-bit vectors use YMM registers.
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
-
-  // 512-bit vectors use ZMM registers.
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+  CCCustom<"CC_X86_64_VectorCall">,
 
   // Delegate to fastcall to handle integer types.
   CCDelegateTo<CC_X86_Win64_C>
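For context, a sketch of the TableGen-generated glue for a CCCustom entry (as typically emitted by the CallingConvEmitter backend; this is not literal generated code): the named hook is called, and the search stops when it returns true, which is why CC_X86_64_VectorCall can fall through to the Win64 C rules below by returning false.

    // Roughly what X86GenCallingConv.inc contains for the entry above:
    if (CC_X86_64_VectorCall(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
      return false;   // Value was assigned by the custom handler.
    // Otherwise fall through to CCDelegateTo<CC_X86_Win64_C>.
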
@@ -844,25 +838,9 @@
   CCDelegateTo<CC_X86_32_Common>
 ]>;
 
-def CC_X86_32_VectorCall : CallingConv<[
-  // The first 6 floating point and vector types of 128 bits or less use
-  // XMM0-XMM5.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
-
-  // 256-bit vectors use YMM registers.
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
-
-  // 512-bit vectors use ZMM registers.
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
-
-  // Otherwise, pass it indirectly.
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64,
-            v32i8, v16i16, v8i32, v4i64, v8f32, v4f64,
-            v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCCustom<"CC_X86_32_VectorCallIndirect">>,
+def CC_X86_Win32_VectorCall : CallingConv<[
+  // Pass floating point in XMMs.
+  CCCustom<"CC_X86_32_VectorCall">,
 
   // Delegate to fastcall to handle integer types.
   CCDelegateTo<CC_X86_32_FastCall>
@@ -996,7 +974,7 @@
   CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
   CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
   CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
-  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
   CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
   CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -17,6 +17,7 @@
 #include "X86CallingConv.h"
 #include "X86FrameLowering.h"
 #include "X86InstrBuilder.h"
+#include "X86IntrinsicsInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86ShuffleDecodeConstantPool.h"
 #include "X86TargetMachine.h"
@@ -53,10 +54,10 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetOptions.h"
-#include "X86IntrinsicsInfo.h"
+#include <algorithm>
 #include <bitset>
-#include <numeric>
 #include <cctype>
+#include <numeric>
 using namespace llvm;
 
 #define DEBUG_TYPE "x86-isel"
@@ -2751,6 +2752,13 @@
   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
 }
 
+static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
+  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
+                        [](const CCValAssign &A, const CCValAssign &B) -> bool {
+                          return A.getValNo() < B.getValNo();
+                        });
+}
+
 SDValue X86TargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -2785,11 +2793,22 @@
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
 
-  // Allocate shadow area for Win64
+  // Allocate shadow area for Win64.
   if (IsWin64)
     CCInfo.AllocateStack(32, 8);
 
-  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+  CCInfo.AnalyzeArguments(Ins, CC_X86);
+
+  // In the vectorcall calling convention, a second pass is required for HVA
+  // arguments.
+  if (CallConv == CallingConv::X86_VectorCall) {
+    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
+  }
+
+  // The next loop assumes that the locations are in the same order as the
+  // input arguments.
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
 
   SDValue ArgValue;
   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
@@ -3232,11 +3251,17 @@
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
 
-  // Allocate shadow area for Win64
+  // Allocate shadow area for Win64.
   if (IsWin64)
     CCInfo.AllocateStack(32, 8);
 
-  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+  CCInfo.AnalyzeArguments(Outs, CC_X86);
+
+  // In the vectorcall calling convention, a second pass is required for HVA
+  // arguments.
+  if (CallConv == CallingConv::X86_VectorCall) {
+    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
+  }
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
@@ -3291,6 +3316,11 @@
   SmallVector<SDValue, 8> MemOpChains;
   SDValue StackPtr;
 
+  // The next loop assumes that the locations are in the same order as the
+  // input arguments.
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
+
   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   // of tail call optimization arguments are handle later.
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Index: test/CodeGen/X86/vectorcall.ll
===================================================================
--- test/CodeGen/X86/vectorcall.ll
+++ test/CodeGen/X86/vectorcall.ll
@@ -6,14 +6,12 @@
 define x86_vectorcallcc i32 @test_int_1() {
   ret i32 0
 }
-
 ; CHECK-LABEL: {{^}}test_int_1@@0:
 ; CHECK: xorl %eax, %eax
 
 define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) {
   ret i32 %a
 }
-
 ; X86-LABEL: {{^}}test_int_2@@4:
 ; X64-LABEL: {{^}}test_int_2@@8:
 ; CHECK: movl %ecx, %eax
@@ -22,7 +20,6 @@
   %at = trunc i64 %a to i32
   ret i32 %at
 }
-
 ; X86-LABEL: {{^}}test_int_3@@8:
 ; X64-LABEL: {{^}}test_int_3@@8:
 ; CHECK: movl %ecx, %eax
@@ -31,10 +28,8 @@
   %s = add i32 %a, %b
   ret i32 %s
 }
-
 ; X86-LABEL: {{^}}test_int_4@@8:
 ; X86: leal (%ecx,%edx), %eax
-
 ; X64-LABEL: {{^}}test_int_4@@16:
 ; X64: leal (%rcx,%rdx), %eax
 
@@ -90,4 +85,139 @@
   ret <16 x i8> %r
 }
 ; CHECK-LABEL: {{^}}test_vec_2@@104:
-; CHECK: movaps (%{{[re]}}cx), %xmm0
+; X64: movq {{[0-9]*}}(%rsp), %rax
+; CHECK: movaps (%{{rax|ecx}}), %xmm0
+
+%struct.HVA5 = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA4 = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA3 = type { <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA2 = type { <4 x float>, <4 x float> }
+
+define x86_vectorcallcc <4 x float> @test_mixed_1(i32 %a, %struct.HVA4 inreg %bb, i32 %c) {
+entry:
+  %b = alloca %struct.HVA4, align 16
+  store %struct.HVA4 %bb, %struct.HVA4* %b, align 16
+  %w1 = getelementptr inbounds %struct.HVA4, %struct.HVA4* %b, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %w1, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_1
+; CHECK: movaps %xmm1, 16(%{{(e|r)}}sp)
+; CHECK: movaps 16(%{{(e|r)}}sp), %xmm0
+; CHECK: ret{{q|l}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_2(%struct.HVA4 inreg %a, %struct.HVA4* %b, <4 x float> %c) {
+entry:
+  %c.addr = alloca <4 x float>, align 16
+  store <4 x float> %c, <4 x float>* %c.addr, align 16
+  %0 = load <4 x float>, <4 x float>* %c.addr, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_2
+; X86: movaps %xmm0, (%esp)
+; X64: movaps %xmm2, %xmm0
+; CHECK: ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* %f) {
+entry:
+  %x = getelementptr inbounds %struct.HVA2, %struct.HVA2* %f, i32 0, i32 0
+  %0 = load <4 x float>, <4 x float>* %x, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_3
+; CHECK: movaps (%{{[re][ac]}}x), %xmm0
+; CHECK: ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_4(%struct.HVA4 inreg %a, %struct.HVA2* %bb, <4 x float> %c) {
+entry:
+  %y4 = getelementptr inbounds %struct.HVA2, %struct.HVA2* %bb, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %y4, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_4
+; X86: movaps 16(%eax), %xmm0
+; X64: movaps 16(%rdx), %xmm0
+; CHECK: ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_5(%struct.HVA3 inreg %a, %struct.HVA3* %b, <4 x float> %c, %struct.HVA2 inreg %dd) {
+entry:
+  %d = alloca %struct.HVA2, align 16
+  store %struct.HVA2 %dd, %struct.HVA2* %d, align 16
+  %y5 = getelementptr inbounds %struct.HVA2, %struct.HVA2* %d, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %y5, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_5
+; CHECK: movaps %xmm5, 16(%{{(e|r)}}sp)
+; CHECK: movaps 16(%{{(e|r)}}sp), %xmm0
+; CHECK: ret{{[ql]}}
+
+define x86_vectorcallcc %struct.HVA4 @test_mixed_6(%struct.HVA4 inreg %a, %struct.HVA4* %b) {
+entry:
+  %retval = alloca %struct.HVA4, align 16
+  %0 = bitcast %struct.HVA4* %retval to i8*
+  %1 = bitcast %struct.HVA4* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 64, i32 16, i1 false)
+  %2 = load %struct.HVA4, %struct.HVA4* %retval, align 16
+  ret %struct.HVA4 %2
+}
+; CHECK-LABEL: test_mixed_6
+; CHECK: movaps (%{{[re]}}sp), %xmm0
+; CHECK: movaps 16(%{{[re]}}sp), %xmm1
+; CHECK: movaps 32(%{{[re]}}sp), %xmm2
+; CHECK: movaps 48(%{{[re]}}sp), %xmm3
+; CHECK: ret{{[ql]}}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1)
+
+define x86_vectorcallcc void @test_mixed_7(%struct.HVA5* noalias sret %agg.result) {
+entry:
+  %a = alloca %struct.HVA5, align 16
+  %0 = bitcast %struct.HVA5* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 80, i32 16, i1 false)
+  %1 = bitcast %struct.HVA5* %agg.result to i8*
+  %2 = bitcast %struct.HVA5* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 80, i32 16, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_mixed_7
+; CHECK: movaps %xmm{{[0-9]}}, 64(%{{rcx|eax}})
+; CHECK: movaps %xmm{{[0-9]}}, 48(%{{rcx|eax}})
+; CHECK: movaps %xmm{{[0-9]}}, 32(%{{rcx|eax}})
+; CHECK: movaps %xmm{{[0-9]}}, 16(%{{rcx|eax}})
+; CHECK: movaps %xmm{{[0-9]}}, (%{{rcx|eax}})
+; X64: mov{{[ql]}} %rcx, %rax
+; CHECK: ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_8(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 %e, <4 x float> %f) {
+entry:
+  %f.addr = alloca <4 x float>, align 16
+  store <4 x float> %f, <4 x float>* %f.addr, align 16
+  %0 = load <4 x float>, <4 x float>* %f.addr, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_8
+; X86: movaps %xmm4, %xmm0
+; X64: movaps %xmm5, %xmm0
+; CHECK: ret{{[ql]}}
+
+%struct.HFA4 = type { double, double, double, double }
+declare x86_vectorcallcc double @test_mixed_9_callee(%struct.HFA4 %x, double %y)
+
+define x86_vectorcallcc double @test_mixed_9_caller(%struct.HFA4 inreg %b) {
+entry:
+  %call = call x86_vectorcallcc double @test_mixed_9_callee(%struct.HFA4 inreg %b, double 3.000000e+00)
+  %add = fadd double 1.000000e+00, %call
+  ret double %add
+}
+; CHECK-LABEL: test_mixed_9_caller
+; CHECK: movaps %xmm3, %xmm4
+; CHECK: movaps %xmm2, %xmm3
+; CHECK: movaps %xmm1, %xmm2
+; X86: movapd %xmm0, %xmm1
+; X64: movapd %xmm5, %xmm1
+; CHECK: call{{l|q}} test_mixed_9_callee@@40
+; CHECK: addsd {{.*}}, %xmm0
+; CHECK: ret{{l|q}}
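Note on exercising the new checks: the file's RUN lines are outside this hunk, so the invocations below are an assumption inferred from the CHECK/X86/X64 prefixes used above.

    llc -mtriple=i686-pc-win32 -mattr=+sse2 < test/CodeGen/X86/vectorcall.ll \
      | FileCheck test/CodeGen/X86/vectorcall.ll --check-prefix=CHECK --check-prefix=X86
    llc -mtriple=x86_64-pc-win32 < test/CodeGen/X86/vectorcall.ll \
      | FileCheck test/CodeGen/X86/vectorcall.ll --check-prefix=CHECK --check-prefix=X64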