Index: lib/CodeGen/TargetInfo.cpp =================================================================== --- lib/CodeGen/TargetInfo.cpp +++ lib/CodeGen/TargetInfo.cpp @@ -951,8 +951,7 @@ Class classify(QualType Ty) const; ABIArgInfo classifyReturnType(QualType RetTy, CCState &State) const; ABIArgInfo classifyArgumentType(QualType RetTy, CCState &State) const; - ABIArgInfo reclassifyHvaArgType(QualType RetTy, CCState &State, - const ABIArgInfo& current) const; + /// \brief Updates the number of available free registers, returns /// true if any registers were allocated. bool updateFreeRegs(QualType Ty, CCState &State) const; @@ -1536,27 +1535,6 @@ return true; } -ABIArgInfo -X86_32ABIInfo::reclassifyHvaArgType(QualType Ty, CCState &State, - const ABIArgInfo ¤t) const { - // Assumes vectorCall calling convention. - const Type *Base = nullptr; - uint64_t NumElts = 0; - - if (!Ty->isBuiltinType() && !Ty->isVectorType() && - isHomogeneousAggregate(Ty, Base, NumElts)) { - if (State.FreeSSERegs >= NumElts) { - // HVA types get passed directly in registers if there is room. - State.FreeSSERegs -= NumElts; - return getDirectX86Hva(); - } - // If there's no room, the HVA gets passed as normal indirect - // structure. - return getIndirectResult(Ty, /*ByVal=*/false, State); - } - return current; -} - ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CCState &State) const { // FIXME: Set alignment on indirect arguments. @@ -1575,35 +1553,20 @@ } } - // vectorcall adds the concept of a homogenous vector aggregate, similar - // to other targets, regcall uses some of the HVA rules. + // Regcall uses the concept of a homogenous vector aggregate, similar + // to other targets. const Type *Base = nullptr; uint64_t NumElts = 0; - if ((State.CC == llvm::CallingConv::X86_VectorCall || - State.CC == llvm::CallingConv::X86_RegCall) && + if (State.CC == llvm::CallingConv::X86_RegCall && isHomogeneousAggregate(Ty, Base, NumElts)) { - if (State.CC == llvm::CallingConv::X86_RegCall) { - if (State.FreeSSERegs >= NumElts) { - State.FreeSSERegs -= NumElts; - if (Ty->isBuiltinType() || Ty->isVectorType()) - return ABIArgInfo::getDirect(); - return ABIArgInfo::getExpand(); - - } - return getIndirectResult(Ty, /*ByVal=*/false, State); - } else if (State.CC == llvm::CallingConv::X86_VectorCall) { - if (State.FreeSSERegs >= NumElts && (Ty->isBuiltinType() || Ty->isVectorType())) { - // Actual floating-point types get registers first time through if - // there is registers available - State.FreeSSERegs -= NumElts; + if (State.FreeSSERegs >= NumElts) { + State.FreeSSERegs -= NumElts; + if (Ty->isBuiltinType() || Ty->isVectorType()) return ABIArgInfo::getDirect(); - } else if (!Ty->isBuiltinType() && !Ty->isVectorType()) { - // HVA Types only get registers after everything else has been - // set, so it gets set as indirect for now. - return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty)); - } + return ABIArgInfo::getExpand(); } + return getIndirectResult(Ty, /*ByVal=*/false, State); } if (isAggregateTypeForABI(Ty)) { @@ -1684,31 +1647,53 @@ void X86_32ABIInfo::computeVectorCallArgs(CGFunctionInfo &FI, CCState &State, bool &UsedInAlloca) const { - // Vectorcall only allows the first 6 parameters to be passed in registers, - // and homogeneous vector aggregates are only put into registers as a second - // priority. - unsigned Count = 0; - CCState ZeroState = State; - ZeroState.FreeRegs = ZeroState.FreeSSERegs = 0; - // HVAs must be done as a second priority for registers, so the deferred - // items are dealt with by going through the pattern a second time. + // Vectorcall x86 works subtly different than in x64, so the format is + // a bit different than the x64 version. First, all vector types (not HVAs) + // are assigned, with the first 6 ending up in the YMM0-5 or XMM0-5 registers. + // This differs from the x64 implementation, where the first 6 by INDEX get + // registers. + // After that, integers AND HVAs are assigned Left to Right in the same pass. + // Integers are passed as ECX/EDX if one is available (in order). HVAs will + // first take up the remaining YMM/XMM registers. If insufficient registers + // remain but an integer register (ECX/EDX) is available, it will be passed + // in that, else, on the stack. for (auto &I : FI.arguments()) { - if (Count < VectorcallMaxParamNumAsReg) - I.info = classifyArgumentType(I.type, State); - else - // Parameters after the 6th cannot be passed in registers, - // so pretend there are no registers left for them. - I.info = classifyArgumentType(I.type, ZeroState); - UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca); - ++Count; + // First pass do all the vector types. + const Type *Base = nullptr; + uint64_t NumElts = 0; + const QualType& Ty = I.type; + if ((Ty->isVectorType() || Ty->isBuiltinType()) && + isHomogeneousAggregate(Ty, Base, NumElts)) { + if (State.FreeSSERegs >= NumElts) { + State.FreeSSERegs -= NumElts; + I.info = ABIArgInfo::getDirect(); + } else { + I.info = classifyArgumentType(Ty, State); + } + UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca); + } } - Count = 0; - // Go through the arguments a second time to get HVAs registers if there - // are still some available. + for (auto &I : FI.arguments()) { - if (Count < VectorcallMaxParamNumAsReg) - I.info = reclassifyHvaArgType(I.type, State, I.info); - ++Count; + // Second pass, do the rest! + const Type *Base = nullptr; + uint64_t NumElts = 0; + const QualType& Ty = I.type; + bool IsHva = isHomogeneousAggregate(Ty, Base, NumElts); + + if (IsHva && !Ty->isVectorType() && !Ty->isBuiltinType()) { + // Assign true HVAs (non vector/native FP types). + if (State.FreeSSERegs >= NumElts) { + State.FreeSSERegs -= NumElts; + I.info = getDirectX86Hva(); + } else { + I.info = getIndirectResult(Ty, /*ByVal=*/false, State); + } + } else if (!IsHva) { + // Assign all Non-HVAs, so this will exclude Vector/FP args. + I.info = classifyArgumentType(Ty, State); + UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca); + } } } @@ -3901,6 +3886,8 @@ bool IsRegCall) const { unsigned Count = 0; for (auto &I : FI.arguments()) { + // Vectorcall in x64 only permits the first 6 arguments to be passed + // as XMM/YMM registers. if (Count < VectorcallMaxParamNumAsReg) I.info = classify(I.type, FreeSSERegs, false, IsVectorCall, IsRegCall); else { @@ -3913,11 +3900,8 @@ ++Count; } - Count = 0; for (auto &I : FI.arguments()) { - if (Count < VectorcallMaxParamNumAsReg) - I.info = reclassifyHvaArgType(I.type, FreeSSERegs, I.info); - ++Count; + I.info = reclassifyHvaArgType(I.type, FreeSSERegs, I.info); } } Index: test/CodeGen/vectorcall.c =================================================================== --- test/CodeGen/vectorcall.c +++ test/CodeGen/vectorcall.c @@ -100,8 +100,19 @@ // X32: define x86_vectorcallcc void @"\01odd_size_hva@@32"(%struct.OddSizeHVA inreg %a.coerce) // X64: define x86_vectorcallcc void @"\01odd_size_hva@@32"(%struct.OddSizeHVA inreg %a.coerce) -// The Vectorcall ABI only allows passing the first 6 items in registers, so this shouldn't +// The Vectorcall ABI only allows passing the first 6 items in registers in x64, so this shouldn't // consider 'p7' as a register. Instead p5 gets put into the register on the second pass. -struct HFA2 __vectorcall AddParticles(struct HFA2 p1, float p2, struct HFA4 p3, int p4, struct HFA2 p5, float p6, float p7){ return p1;} -// X32: define x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@80"(%struct.HFA2 inreg %p1.coerce, float %p2, %struct.HFA4* inreg %p3, i32 inreg %p4, %struct.HFA2 inreg %p5.coerce, float %p6, float %p7) -// X64: define x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@96"(%struct.HFA2 inreg %p1.coerce, float %p2, %struct.HFA4* %p3, i32 %p4, %struct.HFA2 inreg %p5.coerce, float %p6, float %p7) +// x86 should pass p2, p6 and p7 in registers, then p1 in the second pass. +struct HFA2 __vectorcall AddParticles(struct HFA2 p1, float p2, struct HFA4 p3, int p4, struct HFA2 p5, float p6, float p7, int p8){ return p1;} +// X32: define x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@84"(%struct.HFA2 inreg %p1.coerce, float %p2, %struct.HFA4* inreg %p3, i32 inreg %p4, %struct.HFA2* %p5, float %p6, float %p7, i32 %p8) +// X64: define x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@104"(%struct.HFA2 inreg %p1.coerce, float %p2, %struct.HFA4* %p3, i32 %p4, %struct.HFA2 inreg %p5.coerce, float %p6, float %p7, i32 %p8) + +// Vectorcall in both architectures allows passing of an HVA as long as there is room, +// even if it is not one of the first 6 arguments. First pass puts p4 into a +// register on both. p9 ends up in a register in x86 only. Second pass puts p1 +// in a register, does NOT put p7 in a register (since theres no room), then puts +// p8 in a register. +void __vectorcall HVAAnywhere(struct HFA2 p1, int p2, int p3, float p4, int p5, int p6, struct HFA4 p7, struct HFA2 p8, float p9){} +// X32: define x86_vectorcallcc void @"\01HVAAnywhere@@88"(%struct.HFA2 inreg %p1.coerce, i32 inreg %p2, i32 inreg %p3, float %p4, i32 %p5, i32 %p6, %struct.HFA4* %p7, %struct.HFA2 inreg %p8.coerce, float %p9) +// X64: define x86_vectorcallcc void @"\01HVAAnywhere@@112"(%struct.HFA2 inreg %p1.coerce, i32 %p2, i32 %p3, float %p4, i32 %p5, i32 %p6, %struct.HFA4* %p7, %struct.HFA2 inreg %p8.coerce, float %p9) +