Index: cfe/trunk/lib/AST/Mangle.cpp
===================================================================
--- cfe/trunk/lib/AST/Mangle.cpp
+++ cfe/trunk/lib/AST/Mangle.cpp
@@ -49,10 +49,11 @@
 
 void MangleContext::anchor() { }
 
-enum StdOrFastCC {
-  SOF_OTHER,
-  SOF_FAST,
-  SOF_STD
+enum CCMangling {
+  CCM_Other,
+  CCM_Fast,
+  CCM_Vector,
+  CCM_Std
 };
 
 static bool isExternC(const NamedDecl *ND) {
@@ -61,20 +62,22 @@
   return cast<FunctionDecl>(ND)->isExternC();
 }
 
-static StdOrFastCC getStdOrFastCallMangling(const ASTContext &Context,
-                                            const NamedDecl *ND) {
+static CCMangling getCallingConvMangling(const ASTContext &Context,
+                                         const NamedDecl *ND) {
   const TargetInfo &TI = Context.getTargetInfo();
   const llvm::Triple &Triple = TI.getTriple();
-  if (!Triple.isOSWindows() || Triple.getArch() != llvm::Triple::x86)
-    return SOF_OTHER;
+  if (!Triple.isOSWindows() ||
+      !(Triple.getArch() == llvm::Triple::x86 ||
+        Triple.getArch() == llvm::Triple::x86_64))
+    return CCM_Other;
 
   if (Context.getLangOpts().CPlusPlus && !isExternC(ND) &&
       TI.getCXXABI() == TargetCXXABI::Microsoft)
-    return SOF_OTHER;
+    return CCM_Other;
 
   const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND);
   if (!FD)
-    return SOF_OTHER;
+    return CCM_Other;
   QualType T = FD->getType();
 
   const FunctionType *FT = T->castAs<FunctionType>();
@@ -82,19 +85,21 @@
   CallingConv CC = FT->getCallConv();
   switch (CC) {
   default:
-    return SOF_OTHER;
+    return CCM_Other;
   case CC_X86FastCall:
-    return SOF_FAST;
+    return CCM_Fast;
   case CC_X86StdCall:
-    return SOF_STD;
+    return CCM_Std;
+  case CC_X86VectorCall:
+    return CCM_Vector;
   }
 }
 
 bool MangleContext::shouldMangleDeclName(const NamedDecl *D) {
   const ASTContext &ASTContext = getASTContext();
-  StdOrFastCC CC = getStdOrFastCallMangling(ASTContext, D);
-  if (CC != SOF_OTHER)
+  CCMangling CC = getCallingConvMangling(ASTContext, D);
+  if (CC != CCM_Other)
     return true;
 
   // In C, functions with no attributes never need to be mangled. Fastpath them.
@@ -131,10 +136,10 @@
   }
 
   const ASTContext &ASTContext = getASTContext();
-  StdOrFastCC CC = getStdOrFastCallMangling(ASTContext, D);
+  CCMangling CC = getCallingConvMangling(ASTContext, D);
   bool MCXX = shouldMangleCXXName(D);
   const TargetInfo &TI = Context.getTargetInfo();
-  if (CC == SOF_OTHER || (MCXX && TI.getCXXABI() == TargetCXXABI::Microsoft)) {
+  if (CC == CCM_Other || (MCXX && TI.getCXXABI() == TargetCXXABI::Microsoft)) {
     if (const ObjCMethodDecl *OMD = dyn_cast<ObjCMethodDecl>(D))
       mangleObjCMethodName(OMD, Out);
     else
@@ -143,9 +148,9 @@
   }
 
   Out << '\01';
-  if (CC == SOF_STD)
+  if (CC == CCM_Std)
     Out << '_';
-  else
+  else if (CC == CCM_Fast)
     Out << '@';
 
   if (!MCXX)
@@ -158,6 +163,8 @@
   const FunctionDecl *FD = cast<FunctionDecl>(D);
   const FunctionType *FT = FD->getType()->castAs<FunctionType>();
   const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FT);
+  if (CC == CCM_Vector)
+    Out << '@';
   Out << '@';
   if (!Proto) {
     Out << '0';
@@ -169,9 +176,11 @@
     if (!MD->isStatic())
       ++ArgWords;
   for (const auto &AT : Proto->param_types())
-    // Size should be aligned to DWORD boundary
-    ArgWords += llvm::RoundUpToAlignment(ASTContext.getTypeSize(AT), 32) / 32;
-  Out << 4 * ArgWords;
+    // Size should be aligned to pointer size.
+    ArgWords += llvm::RoundUpToAlignment(ASTContext.getTypeSize(AT),
+                                         TI.getPointerWidth(0)) /
+                TI.getPointerWidth(0);
+  Out << ((TI.getPointerWidth(0) / 8) * ArgWords);
 }
 
 void MangleContext::mangleGlobalBlock(const BlockDecl *BD,
Index: cfe/trunk/lib/Basic/Targets.cpp
===================================================================
--- cfe/trunk/lib/Basic/Targets.cpp
+++ cfe/trunk/lib/Basic/Targets.cpp
@@ -3503,6 +3503,7 @@
   CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
     return (CC == CC_C ||
+            CC == CC_X86VectorCall ||
             CC == CC_IntelOclBicc ||
             CC == CC_X86_64Win64) ? CCCR_OK : CCCR_Warning;
   }
@@ -3542,6 +3543,7 @@
   }
   CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
     return (CC == CC_C ||
+            CC == CC_X86VectorCall ||
             CC == CC_IntelOclBicc ||
             CC == CC_X86_64SysV) ? CCCR_OK : CCCR_Warning;
   }
Index: cfe/trunk/lib/CodeGen/CGCall.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGCall.cpp
+++ cfe/trunk/lib/CodeGen/CGCall.cpp
@@ -50,7 +50,7 @@
   // TODO: Add support for __pascal to LLVM.
   case CC_X86Pascal: return llvm::CallingConv::C;
   // TODO: Add support for __vectorcall to LLVM.
-  case CC_X86VectorCall: return llvm::CallingConv::C;
+  case CC_X86VectorCall: return llvm::CallingConv::X86_VectorCall;
   }
 }
@@ -603,6 +603,9 @@
   CharUnits UnionSize = CharUnits::Zero();
 
   for (const auto *FD : RD->fields()) {
+    // Skip zero length bitfields.
+    if (FD->isBitField() && FD->getBitWidthValue(Context) == 0)
+      continue;
     assert(!FD->isBitField() &&
            "Cannot expand structure with bit-field members.");
     CharUnits FieldSize = Context.getTypeSizeInChars(FD->getType());
@@ -622,6 +625,9 @@
   }
 
   for (const auto *FD : RD->fields()) {
+    // Skip zero length bitfields.
+    if (FD->isBitField() && FD->getBitWidthValue(Context) == 0)
+      continue;
     assert(!FD->isBitField() &&
            "Cannot expand structure with bit-field members.");
     Fields.push_back(FD);
Index: cfe/trunk/lib/CodeGen/MicrosoftCXXABI.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/MicrosoftCXXABI.cpp
+++ cfe/trunk/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -617,8 +617,15 @@
   if (RD->hasNonTrivialCopyConstructor())
     return RAA_Indirect;
 
-  // Win64 passes objects larger than 8 bytes indirectly.
-  if (getContext().getTypeSize(RD->getTypeForDecl()) > 64)
+  // If an object has a destructor, we'd really like to pass it indirectly
+  // because it allows us to elide copies. Unfortunately, MSVC makes that
+  // impossible for small types, which it will pass in a single register or
+  // stack slot. Most objects with dtors are large-ish, so handle that early.
+  // We can't call out all large objects as being indirect because there are
+  // multiple x64 calling conventions and the C++ ABI code shouldn't dictate
+  // how we pass large POD types.
+  if (RD->hasNonTrivialDestructor() &&
+      getContext().getTypeSize(RD->getTypeForDecl()) > 64)
     return RAA_Indirect;
 
   // We have a trivial copy constructor or no copy constructors, but we have
Index: cfe/trunk/lib/CodeGen/TargetInfo.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/TargetInfo.cpp
+++ cfe/trunk/lib/CodeGen/TargetInfo.cpp
@@ -508,18 +508,39 @@
   return Ty;
 }
 
+/// Returns true if this type can be passed in SSE registers with the
+/// X86_VectorCall calling convention. Shared between x86_32 and x86_64.
+static bool isX86VectorTypeForVectorCall(ASTContext &Context, QualType Ty) {
+  if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
+    if (BT->isFloatingPoint() && BT->getKind() != BuiltinType::Half)
+      return true;
+  } else if (const VectorType *VT = Ty->getAs<VectorType>()) {
+    // vectorcall can pass XMM, YMM, and ZMM vectors. We don't pass SSE1 MMX
+    // registers specially.
+    unsigned VecSize = Context.getTypeSize(VT);
+    if (VecSize == 128 || VecSize == 256 || VecSize == 512)
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if this aggregate is small enough to be passed in SSE registers
+/// in the X86_VectorCall calling convention. Shared between x86_32 and x86_64.
+static bool isX86VectorCallAggregateSmallEnough(uint64_t NumMembers) {
+  return NumMembers <= 4;
+}
+
 //===----------------------------------------------------------------------===//
 // X86-32 ABI Implementation
 //===----------------------------------------------------------------------===//
 
 /// \brief Similar to llvm::CCState, but for Clang.
 struct CCState {
-  CCState(unsigned CC) : CC(CC), FreeRegs(0) {}
+  CCState(unsigned CC) : CC(CC), FreeRegs(0), FreeSSERegs(0) {}
 
   unsigned CC;
   unsigned FreeRegs;
-  unsigned StackOffset;
-  bool UseInAlloca;
+  unsigned FreeSSERegs;
 };
 
 /// X86_32ABIInfo - The X86-32 ABI information.
@@ -540,6 +561,17 @@
     return (Size == 8 || Size == 16 || Size == 32 || Size == 64);
   }
 
+  bool isHomogeneousAggregateBaseType(QualType Ty) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorTypeForVectorCall(getContext(), Ty);
+  }
+
+  bool isHomogeneousAggregateSmallEnough(const Type *Ty,
+                                         uint64_t NumMembers) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorCallAggregateSmallEnough(NumMembers);
+  }
+
   bool shouldReturnTypeInRegister(QualType Ty, ASTContext &Context) const;
 
   /// getIndirectResult - Give a source type \arg Ty, return a suitable result
@@ -767,6 +799,14 @@
   if (RetTy->isVoidType())
     return ABIArgInfo::getIgnore();
 
+  const Type *Base = nullptr;
+  uint64_t NumElts = 0;
+  if (State.CC == llvm::CallingConv::X86_VectorCall &&
+      isHomogeneousAggregate(RetTy, Base, NumElts)) {
+    // The LLVM struct type for such an aggregate should lower properly.
+    return ABIArgInfo::getDirect();
+  }
+
   if (const VectorType *VT = RetTy->getAs<VectorType>()) {
     // On Darwin, some vectors are returned in registers.
     if (IsDarwinVectorABI) {
@@ -939,7 +979,8 @@
 
   State.FreeRegs -= SizeInRegs;
 
-  if (State.CC == llvm::CallingConv::X86_FastCall) {
+  if (State.CC == llvm::CallingConv::X86_FastCall ||
+      State.CC == llvm::CallingConv::X86_VectorCall) {
     if (Size > 32)
       return false;
 
@@ -964,17 +1005,36 @@
 ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
                                                CCState &State) const {
   // FIXME: Set alignment on indirect arguments.
-  if (isAggregateTypeForABI(Ty)) {
-    if (const RecordType *RT = Ty->getAs<RecordType>()) {
-      // Check with the C++ ABI first.
-      CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI());
-      if (RAA == CGCXXABI::RAA_Indirect) {
-        return getIndirectResult(Ty, false, State);
-      } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
-        // The field index doesn't matter, we'll fix it up later.
-        return ABIArgInfo::getInAlloca(/*FieldIndex=*/0);
-      }
-    }
+  // Check with the C++ ABI first.
+  const RecordType *RT = Ty->getAs<RecordType>();
+  if (RT) {
+    CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI());
+    if (RAA == CGCXXABI::RAA_Indirect) {
+      return getIndirectResult(Ty, false, State);
+    } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
+      // The field index doesn't matter, we'll fix it up later.
+      return ABIArgInfo::getInAlloca(/*FieldIndex=*/0);
+    }
+  }
+
+  // vectorcall adds the concept of a homogeneous vector aggregate, similar
+  // to other targets.
+  const Type *Base = nullptr;
+  uint64_t NumElts = 0;
+  if (State.CC == llvm::CallingConv::X86_VectorCall &&
+      isHomogeneousAggregate(Ty, Base, NumElts)) {
+    if (State.FreeSSERegs >= NumElts) {
+      State.FreeSSERegs -= NumElts;
+      if (Ty->isBuiltinType() || Ty->isVectorType())
+        return ABIArgInfo::getDirect();
+      return ABIArgInfo::getExpand();
+    }
+    return getIndirectResult(Ty, /*ByVal=*/false, State);
+  }
+
+  if (isAggregateTypeForABI(Ty)) {
+    if (RT) {
       // Structs are always byval on win32, regardless of what they contain.
       if (IsWin32StructABI)
         return getIndirectResult(Ty, true, State);
@@ -1006,7 +1066,9 @@
     if (getContext().getTypeSize(Ty) <= 4*32 &&
         canExpandIndirectArgument(Ty, getContext()))
       return ABIArgInfo::getExpandWithPadding(
-          State.CC == llvm::CallingConv::X86_FastCall, PaddingType);
+          State.CC == llvm::CallingConv::X86_FastCall ||
+              State.CC == llvm::CallingConv::X86_VectorCall,
+          PaddingType);
 
     return getIndirectResult(Ty, true, State);
   }
@@ -1049,7 +1111,10 @@
   CCState State(FI.getCallingConvention());
   if (State.CC == llvm::CallingConv::X86_FastCall)
     State.FreeRegs = 2;
-  else if (FI.getHasRegParm())
+  else if (State.CC == llvm::CallingConv::X86_VectorCall) {
+    State.FreeRegs = 2;
+    State.FreeSSERegs = 6;
+  } else if (FI.getHasRegParm())
    State.FreeRegs = FI.getRegParm();
   else
     State.FreeRegs = DefaultNumRegisterParameters;
@@ -1434,7 +1499,8 @@
 
 /// WinX86_64ABIInfo - The Windows X86_64 ABI information.
 class WinX86_64ABIInfo : public ABIInfo {
-  ABIArgInfo classify(QualType Ty, bool IsReturnType) const;
+  ABIArgInfo classify(QualType Ty, unsigned &FreeSSERegs,
+                      bool IsReturnType) const;
 
 public:
   WinX86_64ABIInfo(CodeGen::CodeGenTypes &CGT) : ABIInfo(CGT) {}
@@ -1443,6 +1509,17 @@
 
   llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
                          CodeGenFunction &CGF) const override;
+
+  bool isHomogeneousAggregateBaseType(QualType Ty) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorTypeForVectorCall(getContext(), Ty);
+  }
+
+  bool isHomogeneousAggregateSmallEnough(const Type *Ty,
+                                         uint64_t NumMembers) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorCallAggregateSmallEnough(NumMembers);
+  }
 };
 
 class X86_64TargetCodeGenInfo : public TargetCodeGenInfo {
@@ -2844,7 +2921,8 @@
   return ResAddr;
 }
 
-ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, bool IsReturnType) const {
+ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
+                                      bool IsReturnType) const {
   if (Ty->isVoidType())
     return ABIArgInfo::getIgnore();
 
@@ -2852,7 +2930,9 @@
   if (const EnumType *EnumTy = Ty->getAs<EnumType>())
     Ty = EnumTy->getDecl()->getIntegerType();
 
-  uint64_t Size = getContext().getTypeSize(Ty);
+  TypeInfo Info = getContext().getTypeInfo(Ty);
+  uint64_t Width = Info.Width;
+  unsigned Align = getContext().toCharUnitsFromBits(Info.Align).getQuantity();
 
   const RecordType *RT = Ty->getAs<RecordType>();
   if (RT) {
@@ -2865,11 +2945,26 @@
       return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
 
     // FIXME: mingw-w64-gcc emits 128-bit struct as i128
-    if (Size == 128 && getTarget().getTriple().isWindowsGNUEnvironment())
+    if (Width == 128 && getTarget().getTriple().isWindowsGNUEnvironment())
       return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),
-                                                          Size));
+                                                          Width));
+  }
+
+  // vectorcall adds the concept of a homogeneous vector aggregate, similar to
+  // other targets.
+  const Type *Base = nullptr;
+  uint64_t NumElts = 0;
+  if (FreeSSERegs && isHomogeneousAggregate(Ty, Base, NumElts)) {
+    if (FreeSSERegs >= NumElts) {
+      FreeSSERegs -= NumElts;
+      if (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType())
+        return ABIArgInfo::getDirect();
+      return ABIArgInfo::getExpand();
+    }
+    return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+  }
+
   if (Ty->isMemberPointerType()) {
     // If the member pointer is represented by an LLVM int or ptr, pass it
     // directly.
@@ -2881,11 +2976,11 @@
   if (RT || Ty->isMemberPointerType()) {
     // MS x64 ABI requirement: "Any argument that doesn't fit in 8 bytes, or is
     // not 1, 2, 4, or 8 bytes, must be passed by reference."
-    if (Size > 64 || !llvm::isPowerOf2_64(Size))
+    if (Width > 64 || !llvm::isPowerOf2_64(Width))
       return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
 
     // Otherwise, coerce it to a small integer.
-    return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size));
+    return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Width));
   }
 
   // Bool type is always extended to the ABI, other builtin types are not
@@ -2898,11 +2993,18 @@
 }
 
 void WinX86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const {
+  bool IsVectorCall =
+      FI.getCallingConvention() == llvm::CallingConv::X86_VectorCall;
+
+  // We can use up to 4 SSE return registers with vectorcall.
+  unsigned FreeSSERegs = IsVectorCall ? 4 : 0;
   if (!getCXXABI().classifyReturnType(FI))
-    FI.getReturnInfo() = classify(FI.getReturnType(), true);
+    FI.getReturnInfo() = classify(FI.getReturnType(), FreeSSERegs, true);
 
+  // We can use up to 6 SSE register parameters with vectorcall.
+  FreeSSERegs = IsVectorCall ? 6 : 0;
   for (auto &I : FI.arguments())
-    I.info = classify(I.type, false);
+    I.info = classify(I.type, FreeSSERegs, false);
 }
 
 llvm::Value *WinX86_64ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
Index: cfe/trunk/test/CodeGen/mangle-windows.c
===================================================================
--- cfe/trunk/test/CodeGen/mangle-windows.c
+++ cfe/trunk/test/CodeGen/mangle-windows.c
@@ -1,33 +1,68 @@
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 | FileCheck %s --check-prefix=X64
 
 void __stdcall f1(void) {}
 // CHECK: define x86_stdcallcc void @"\01_f1@0"
+// X64: define void @f1(
 
 void __fastcall f2(void) {}
 // CHECK: define x86_fastcallcc void @"\01@f2@0"
+// X64: define void @f2(
 
 void __stdcall f3() {}
 // CHECK: define x86_stdcallcc void @"\01_f3@0"
+// X64: define void @f3(
 
 void __fastcall f4(char a) {}
 // CHECK: define x86_fastcallcc void @"\01@f4@4"
+// X64: define void @f4(
 
 void __fastcall f5(short a) {}
 // CHECK: define x86_fastcallcc void @"\01@f5@4"
+// X64: define void @f5(
 
 void __fastcall f6(int a) {}
 // CHECK: define x86_fastcallcc void @"\01@f6@4"
+// X64: define void @f6(
 
 void __fastcall f7(long a) {}
 // CHECK: define x86_fastcallcc void @"\01@f7@4"
+// X64: define void @f7(
 
 void __fastcall f8(long long a) {}
 // CHECK: define x86_fastcallcc void @"\01@f8@8"
+// X64: define void @f8(
 
 void __fastcall f9(long long a, char b, char c, short d) {}
-// CHECK: define x86_fastcallcc void @"\01@f9@20"(i64 %a, i8 signext %b, i8
-// signext %c, i16 signext %d)
+// CHECK: define x86_fastcallcc void @"\01@f9@20"(i64 %a, i8 signext %b, i8 signext %c, i16 signext %d)
+// X64: define void @f9(
 
 void f12(void) {}
 // CHECK: define void @f12(
+// X64: define void @f12(
+
+void __vectorcall v1(void) {}
+// CHECK: define x86_vectorcallcc void @"\01v1@@0"(
+// X64: define x86_vectorcallcc void @"\01v1@@0"(
+
+void __vectorcall v2(char a) {}
+// CHECK: define x86_vectorcallcc void @"\01v2@@4"(
+// X64: define x86_vectorcallcc void @"\01v2@@8"(
+
+void __vectorcall v3(short a) {}
+// CHECK: define x86_vectorcallcc void @"\01v3@@4"(
+// X64: define x86_vectorcallcc void @"\01v3@@8"(
+
+void __vectorcall v4(int a) {}
+// CHECK: define x86_vectorcallcc void @"\01v4@@4"(
+// X64: define x86_vectorcallcc void @"\01v4@@8"(
+
+void __vectorcall v5(long long a) {}
+// CHECK: define x86_vectorcallcc void @"\01v5@@8"(
+// X64: define x86_vectorcallcc void @"\01v5@@8"(
+
+void __vectorcall v6(char a, char b) {}
+// CHECK: define x86_vectorcallcc void @"\01v6@@8"(
+// X64: define x86_vectorcallcc void @"\01v6@@16"(
Index: cfe/trunk/test/CodeGen/microsoft-call-conv.c
===================================================================
--- cfe/trunk/test/CodeGen/microsoft-call-conv.c
+++ cfe/trunk/test/CodeGen/microsoft-call-conv.c
@@ -20,9 +20,8 @@
   f3();
   // CHECK: call x86_thiscallcc void @f3()
 }
-// FIXME: Add this to LLVM.
 void __vectorcall f61(void) {
-// CHECK-LABEL: define void @f61()
+// CHECK-LABEL: define x86_vectorcallcc void @f61()
   f3();
   // CHECK: call x86_thiscallcc void @f3()
 }
@@ -41,7 +40,7 @@
   // CHECK: call x86_fastcallcc void @f4()
   // CHECK: call x86_stdcallcc void @f5()
   // CHECK: call x86_thiscallcc void @f6()
-  // CHECK: call void @f61()
+  // CHECK: call x86_vectorcallcc void @f61()
   pf1(); pf2(); pf3(); pf4(); pf5(); pf6(); pf7();
   // CHECK: call x86_fastcallcc void %{{.*}}()
   // CHECK: call x86_stdcallcc void %{{.*}}()
@@ -49,7 +48,7 @@
   // CHECK: call x86_fastcallcc void %{{.*}}()
   // CHECK: call x86_stdcallcc void %{{.*}}()
   // CHECK: call x86_thiscallcc void %{{.*}}()
-  // CHECK: call void %{{.*}}()
+  // CHECK: call x86_vectorcallcc void %{{.*}}()
   return 0;
 }
Index: cfe/trunk/test/CodeGen/vectorcall.c
===================================================================
--- cfe/trunk/test/CodeGen/vectorcall.c
+++ cfe/trunk/test/CodeGen/vectorcall.c
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64
+
+void __vectorcall v1(int a, int b) {}
+// CHECK: define x86_vectorcallcc void @"\01v1@@8"(i32 inreg %a, i32 inreg %b)
+// X64: define x86_vectorcallcc void @"\01v1@@16"(i32 %a, i32 %b)
+
+void __vectorcall v2(char a, char b) {}
+// CHECK: define x86_vectorcallcc void @"\01v2@@8"(i8 inreg signext %a, i8 inreg signext %b)
+// X64: define x86_vectorcallcc void @"\01v2@@16"(i8 %a, i8 %b)
+
+struct Small { int a; };
+void __vectorcall v3(int a, struct Small b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, %struct.Small* byval align 4 %b, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01v3@@24"(i32 %a, i32 %b.coerce, i32 %c)
+
+struct Large { int a[5]; };
+void __vectorcall v4(int a, struct Large b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01v4@@28"(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01v4@@40"(i32 %a, %struct.Large* %b, i32 %c)
+
+struct HFA2 { double x, y; };
+struct HFA4 { double w, x, y, z; };
+struct HFA5 { double v, w, x, y, z; };
+
+void __vectorcall hfa1(int a, struct HFA4 b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa1@@40"(i32 inreg %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01hfa1@@48"(i32 %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 %c)
+
+// HFAs that would require more than six total SSE registers are passed
+// indirectly. Additional vector arguments can consume the rest of the SSE
+// registers.
+void __vectorcall hfa2(struct HFA4 a, struct HFA4 b, double c) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa2@@72"(double %a.0, double %a.1, double %a.2, double %a.3, %struct.HFA4* inreg %b, double %c)
+// X64: define x86_vectorcallcc void @"\01hfa2@@72"(double %a.0, double %a.1, double %a.2, double %a.3, %struct.HFA4* align 8 %b, double %c)
+
+// Ensure that we pass builtin types directly while counting them against the
+// SSE register usage.
+void __vectorcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* inreg %f)
+// X64: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* align 8 %f)
+
+// Aggregates with more than four elements are not HFAs and are passed byval.
+// Because they are not classified as homogeneous, they don't get special
+// handling to ensure alignment.
+void __vectorcall hfa4(struct HFA5 a) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* byval align 4)
+// X64: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* %a)
+
+// Return HFAs of 4 or fewer elements in registers.
+static struct HFA2 g_hfa2;
+struct HFA2 __vectorcall hfa5(void) { return g_hfa2; }
+// CHECK: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
+// X64: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
+
+typedef float __attribute__((vector_size(16))) v4f32;
+struct HVA2 { v4f32 x, y; };
+struct HVA4 { v4f32 w, x, y, z; };
+
+void __vectorcall hva1(int a, struct HVA4 b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01hva1@@72"(i32 inreg %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01hva1@@80"(i32 %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 %c)
+
+void __vectorcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {}
+// CHECK: define x86_vectorcallcc void @"\01hva2@@144"(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, %struct.HVA4* inreg %b, <4 x float> %c)
+// X64: define x86_vectorcallcc void @"\01hva2@@144"(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, %struct.HVA4* align 16 %b, <4 x float> %c)
+
+void __vectorcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {}
+// CHECK: define x86_vectorcallcc void @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* inreg %f)
+// X64: define x86_vectorcallcc void @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* align 16 %f)
+
+typedef float __attribute__((ext_vector_type(3))) v3f32;
+struct OddSizeHVA { v3f32 x, y; };
+
+void __vectorcall odd_size_hva(struct OddSizeHVA a) {}
+// CHECK: define x86_vectorcallcc void @"\01odd_size_hva@@32"(<3 x float> %a.0, <3 x float> %a.1)
+// X64: define x86_vectorcallcc void @"\01odd_size_hva@@32"(<3 x float> %a.0, <3 x float> %a.1)
Index: cfe/trunk/test/CodeGenCXX/homogeneous-aggregates.cpp
===================================================================
--- cfe/trunk/test/CodeGenCXX/homogeneous-aggregates.cpp
+++ cfe/trunk/test/CodeGenCXX/homogeneous-aggregates.cpp
@@ -1,6 +1,13 @@
-// RUNxX: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC
+// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC
 // RUN: %clang_cc1 -mfloat-abi hard -triple armv7-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ARM32
 // RUN: %clang_cc1 -mfloat-abi hard -triple aarch64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ARM64
+// RUN: %clang_cc1 -mfloat-abi hard -triple x86_64-unknown-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=X64
+
+#if defined(__x86_64__)
+#define CC __attribute__((vectorcall))
+#else
+#define CC
+#endif
 
 // Test that C++ classes are correctly classified as homogeneous aggregates.
 
@@ -34,24 +41,26 @@
 // PPC: define void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, [3 x i64] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, { [3 x i64] } %x.coerce)
 // ARM64: define void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, %struct.D1* %x)
-D1 func_D1(D1 x) { return x; }
+// X64: define x86_vectorcallcc void @"\01_Z7func_D12D1@@24"(%struct.D1* noalias sret %agg.result, %struct.D1* %x)
+D1 CC func_D1(D1 x) { return x; }
 
 // PPC: define [3 x double] @_Z7func_D22D2([3 x double] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc %struct.D2 @_Z7func_D22D2(%struct.D2 %x.coerce)
 // ARM64: define %struct.D2 @_Z7func_D22D2(double %x.0, double %x.1, double %x.2)
-D2 func_D2(D2 x) { return x; }
+// X64: define x86_vectorcallcc %struct.D2 @"\01_Z7func_D22D2@@24"(double %x.0, double %x.1, double %x.2)
+D2 CC func_D2(D2 x) { return x; }
 
 // PPC: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, [4 x i64] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, { [4 x i64] } %x.coerce)
 // ARM64: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, %struct.D3* %x)
-D3 func_D3(D3 x) { return x; }
+D3 CC func_D3(D3 x) { return x; }
 
 // PPC: define [4 x double] @_Z7func_D42D4([4 x double] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc %struct.D4 @_Z7func_D42D4(%struct.D4 %x.coerce)
 // ARM64: define %struct.D4 @_Z7func_D42D4(double %x.0, double %x.1, double %x.2, double %x.3)
-D4 func_D4(D4 x) { return x; }
+D4 CC func_D4(D4 x) { return x; }
 
-D5 func_D5(D5 x) { return x; }
+D5 CC func_D5(D5 x) { return x; }
 // PPC: define [3 x double] @_Z7func_D52D5([3 x double] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc %struct.D5 @_Z7func_D52D5(%struct.D5 %x.coerce)
@@ -92,3 +101,27 @@
 // ARM64: getelementptr inbounds %struct.Base2* %{{.*}}, i32 0, i32 0
 // ARM64: load double*
 // ARM64: call %struct.D5 @_Z7func_D52D5(double %{{.*}}, double %{{.*}}, double %{{.*}})
+
+struct Empty { };
+struct Float1 { float x; };
+struct Float2 { float y; };
+struct HVAWithEmptyBase : Float1, Empty, Float2 { float z; };
+
+// PPC: define void @_Z15with_empty_base16HVAWithEmptyBase([3 x float] %a.coerce)
+// ARM64: define void @_Z15with_empty_base16HVAWithEmptyBase(float %a.0, float %a.1, float %a.2)
+// ARM32: define arm_aapcs_vfpcc void @_Z15with_empty_base16HVAWithEmptyBase(%struct.HVAWithEmptyBase %a.coerce)
+void CC with_empty_base(HVAWithEmptyBase a) {}
+
+// FIXME: MSVC doesn't consider this an HVA because of the empty base.
+// X64: define x86_vectorcallcc void @"\01_Z15with_empty_base16HVAWithEmptyBase@@16"(float %a.0, float %a.1, float %a.2)
+
+struct HVAWithEmptyBitField : Float1, Float2 {
+  int : 0;  // Takes no space.
+  float z;
+};
+
+// PPC: define void @_Z19with_empty_bitfield20HVAWithEmptyBitField([3 x float] %a.coerce)
+// ARM64: define void @_Z19with_empty_bitfield20HVAWithEmptyBitField(float %a.0, float %a.1, float %a.2)
+// ARM32: define arm_aapcs_vfpcc void @_Z19with_empty_bitfield20HVAWithEmptyBitField(%struct.HVAWithEmptyBitField %a.coerce)
+// X64: define x86_vectorcallcc void @"\01_Z19with_empty_bitfield20HVAWithEmptyBitField@@16"(float %a.0, float %a.1, float %a.2)
+void CC with_empty_bitfield(HVAWithEmptyBitField a) {}
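
For orientation only, and not part of the patch itself: a minimal caller-side sketch of the convention the new tests exercise. The type and function names below are hypothetical; they simply mirror the HVA shapes covered by test/CodeGen/vectorcall.c.

    /* Hypothetical illustration of __vectorcall HVA passing; not from the patch. */
    #include <xmmintrin.h>

    /* A homogeneous vector aggregate (HVA): at most four members of one vector type. */
    struct MyHva2 { __m128 x, y; };

    /* Under __vectorcall the HVA's members are expanded into XMM argument
       registers while enough of the six SSE argument registers remain;
       otherwise the aggregate is passed indirectly, matching the FreeSSERegs
       accounting added to TargetInfo.cpp above. */
    void __vectorcall scale_hva(struct MyHva2 v, float s);

    void caller(struct MyHva2 v) {
      scale_hva(v, 2.0f);
    }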