Index: llvm/trunk/include/llvm/IR/CallingConv.h
===================================================================
--- llvm/trunk/include/llvm/IR/CallingConv.h
+++ llvm/trunk/include/llvm/IR/CallingConv.h
@@ -140,7 +140,11 @@
     /// convention differs from the more common \c X86_64_SysV convention
     /// in a number of ways, most notably in that XMM registers used to pass
     /// arguments are shadowed by GPRs, and vice versa.
-    X86_64_Win64 = 79
+    X86_64_Win64 = 79,
+
+    /// \brief MSVC calling convention that passes vectors and vector
+    /// aggregates in SSE registers.
+    X86_VectorCall = 80
   };
 } // End CallingConv namespace
Index: llvm/trunk/lib/AsmParser/LLLexer.cpp
===================================================================
--- llvm/trunk/lib/AsmParser/LLLexer.cpp
+++ llvm/trunk/lib/AsmParser/LLLexer.cpp
@@ -580,6 +580,7 @@
   KEYWORD(x86_stdcallcc);
   KEYWORD(x86_fastcallcc);
   KEYWORD(x86_thiscallcc);
+  KEYWORD(x86_vectorcallcc);
   KEYWORD(arm_apcscc);
   KEYWORD(arm_aapcscc);
   KEYWORD(arm_aapcs_vfpcc);
Index: llvm/trunk/lib/AsmParser/LLParser.cpp
===================================================================
--- llvm/trunk/lib/AsmParser/LLParser.cpp
+++ llvm/trunk/lib/AsmParser/LLParser.cpp
@@ -1448,6 +1448,7 @@
 ///   ::= 'x86_stdcallcc'
 ///   ::= 'x86_fastcallcc'
 ///   ::= 'x86_thiscallcc'
+///   ::= 'x86_vectorcallcc'
 ///   ::= 'arm_apcscc'
 ///   ::= 'arm_aapcscc'
 ///   ::= 'arm_aapcs_vfpcc'
@@ -1473,6 +1474,7 @@
   case lltok::kw_x86_stdcallcc:  CC = CallingConv::X86_StdCall; break;
   case lltok::kw_x86_fastcallcc: CC = CallingConv::X86_FastCall; break;
   case lltok::kw_x86_thiscallcc: CC = CallingConv::X86_ThisCall; break;
+  case lltok::kw_x86_vectorcallcc:CC = CallingConv::X86_VectorCall; break;
   case lltok::kw_arm_apcscc:     CC = CallingConv::ARM_APCS; break;
   case lltok::kw_arm_aapcscc:    CC = CallingConv::ARM_AAPCS; break;
   case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break;
Index: llvm/trunk/lib/AsmParser/LLToken.h
===================================================================
--- llvm/trunk/lib/AsmParser/LLToken.h
+++ llvm/trunk/lib/AsmParser/LLToken.h
@@ -87,7 +87,7 @@
   kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
   kw_intel_ocl_bicc,
-  kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc,
+  kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc, kw_x86_vectorcallcc,
   kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
   kw_msp430_intrcc,
   kw_ptx_kernel, kw_ptx_device,
Index: llvm/trunk/lib/IR/AsmWriter.cpp
===================================================================
--- llvm/trunk/lib/IR/AsmWriter.cpp
+++ llvm/trunk/lib/IR/AsmWriter.cpp
@@ -285,6 +285,7 @@
   case CallingConv::X86_StdCall:   Out << "x86_stdcallcc"; break;
   case CallingConv::X86_FastCall:  Out << "x86_fastcallcc"; break;
   case CallingConv::X86_ThisCall:  Out << "x86_thiscallcc"; break;
+  case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
  case CallingConv::Intel_OCL_BI:  Out << "intel_ocl_bicc"; break;
  case CallingConv::ARM_APCS:      Out << "arm_apcscc"; break;
  case CallingConv::ARM_AAPCS:     Out << "arm_aapcscc"; break;
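With the lexer, parser, token, and printer changes above, the new convention
round-trips through textual IR. A minimal sketch (hypothetical function names,
nothing beyond this patch assumed):

; Call sites must repeat the callee's calling convention.
declare x86_vectorcallcc <4 x float> @vadd(<4 x float>, <4 x float>)

define <4 x float> @caller(<4 x float> %a, <4 x float> %b) {
  %r = call x86_vectorcallcc <4 x float> @vadd(<4 x float> %a, <4 x float> %b)
  ret <4 x float> %r
}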
Index: llvm/trunk/lib/IR/Mangler.cpp
===================================================================
--- llvm/trunk/lib/IR/Mangler.cpp
+++ llvm/trunk/lib/IR/Mangler.cpp
@@ -22,7 +22,7 @@
 static void getNameWithPrefixx(raw_ostream &OS, const Twine &GVName,
                                Mangler::ManglerPrefixTy PrefixTy,
-                               const DataLayout &DL, bool UseAt) {
+                               const DataLayout &DL, char Prefix) {
   SmallString<256> TmpData;
   StringRef Name = GVName.toStringRef(TmpData);
   assert(!Name.empty() && "getNameWithPrefix requires non-empty name");
@@ -39,13 +39,8 @@
   else if (PrefixTy == Mangler::LinkerPrivate)
     OS << DL.getLinkerPrivateGlobalPrefix();
 
-  if (UseAt) {
-    OS << '@';
-  } else {
-    char Prefix = DL.getGlobalPrefix();
-    if (Prefix != '\0')
-      OS << Prefix;
-  }
+  if (Prefix != '\0')
+    OS << Prefix;
 
   // If this is a simple string that doesn't need escaping, just append it.
   OS << Name;
@@ -53,7 +48,8 @@
 
 void Mangler::getNameWithPrefix(raw_ostream &OS, const Twine &GVName,
                                 ManglerPrefixTy PrefixTy) const {
-  return getNameWithPrefixx(OS, GVName, PrefixTy, *DL, false);
+  char Prefix = DL->getGlobalPrefix();
+  return getNameWithPrefixx(OS, GVName, PrefixTy, *DL, Prefix);
 }
 
 void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
@@ -63,11 +59,21 @@
   return getNameWithPrefix(OS, GVName, PrefixTy);
 }
 
-/// AddFastCallStdCallSuffix - Microsoft fastcall and stdcall functions require
-/// a suffix on their name indicating the number of words of arguments they
-/// take.
-static void AddFastCallStdCallSuffix(raw_ostream &OS, const Function *F,
-                                     const DataLayout &TD) {
+static bool hasByteCountSuffix(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::X86_FastCall:
+  case CallingConv::X86_StdCall:
+  case CallingConv::X86_VectorCall:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Microsoft fastcall and stdcall functions require a suffix on their name
+/// indicating the number of bytes of arguments they take.
+static void addByteCountSuffix(raw_ostream &OS, const Function *F,
+                               const DataLayout &TD) {
   // Calculate arguments size total.
   unsigned ArgWords = 0;
   for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
@@ -76,8 +82,9 @@
     // 'Dereference' type in case of byval or inalloca parameter attribute.
     if (AI->hasByValOrInAllocaAttr())
       Ty = cast<PointerType>(Ty)->getElementType();
-    // Size should be aligned to DWORD boundary
-    ArgWords += ((TD.getTypeAllocSize(Ty) + 3)/4)*4;
+    // Size should be aligned to pointer size.
+    unsigned PtrSize = TD.getPointerSize();
+    ArgWords += RoundUpToAlignment(TD.getTypeAllocSize(Ty), PtrSize);
   }
 
   OS << '@' << ArgWords;
@@ -106,34 +113,40 @@
   }
 
   StringRef Name = GV->getName();
+  char Prefix = DL->getGlobalPrefix();
 
-  bool UseAt = false;
-  const Function *MSFunc = nullptr;
-  CallingConv::ID CC;
-  if (Name[0] != '\1' && DL->hasMicrosoftFastStdCallMangling()) {
-    if ((MSFunc = dyn_cast<Function>(GV))) {
-      CC = MSFunc->getCallingConv();
-      // fastcall functions need to start with @ instead of _.
-      if (CC == CallingConv::X86_FastCall)
-        UseAt = true;
-    }
+  // Mangle functions with Microsoft calling conventions specially.  Only do
+  // this mangling for x86_64 vectorcall and 32-bit x86.
+  const Function *MSFunc = dyn_cast<Function>(GV);
+  if (Name.startswith("\01"))
+    MSFunc = nullptr; // Don't mangle when \01 is present.
+  CallingConv::ID CC = MSFunc ? MSFunc->getCallingConv() : CallingConv::C;
+  if (!DL->hasMicrosoftFastStdCallMangling() &&
+      CC != CallingConv::X86_VectorCall)
+    MSFunc = nullptr;
+  if (MSFunc) {
+    if (CC == CallingConv::X86_FastCall)
+      Prefix = '@'; // fastcall functions have an @ prefix instead of _.
+    else if (CC == CallingConv::X86_VectorCall)
+      Prefix = '\0'; // vectorcall functions have no prefix.
   }
 
-  getNameWithPrefixx(OS, Name, PrefixTy, *DL, UseAt);
+  getNameWithPrefixx(OS, Name, PrefixTy, *DL, Prefix);
 
   if (!MSFunc)
     return;
 
-  // If we are supposed to add a microsoft-style suffix for stdcall/fastcall,
-  // add it.
-  // fastcall and stdcall functions usually need @42 at the end to specify
-  // the argument info.
+  // If we are supposed to add a microsoft-style suffix for stdcall, fastcall,
+  // or vectorcall, add it.  These functions have a suffix of @N where N is the
+  // cumulative byte size of all of the parameters to the function in decimal.
+  if (CC == CallingConv::X86_VectorCall)
+    OS << '@'; // vectorcall functions use a double @ suffix.
   FunctionType *FT = MSFunc->getFunctionType();
-  if ((CC == CallingConv::X86_FastCall || CC == CallingConv::X86_StdCall) &&
+  if (hasByteCountSuffix(CC) &&
      // "Pure" variadic functions do not receive @0 suffix.
      (!FT->isVarArg() || FT->getNumParams() == 0 ||
       (FT->getNumParams() == 1 && MSFunc->hasStructRetAttr())))
-    AddFastCallStdCallSuffix(OS, MSFunc, *DL);
+    addByteCountSuffix(OS, MSFunc, *DL);
 }
 
 void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
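Concretely (a sketch with a hypothetical function; the byte counts mirror the
tests below), this definition:

define x86_vectorcallcc i32 @f(i32 inreg %a, i32 inreg %b) {
  %s = add i32 %a, %b
  ret i32 %s
}

; should mangle to f@@8 on 32-bit x86 (no global prefix, a double '@', and two
; 4-byte parameter slots) and to f@@16 on x86-64, where each parameter is
; rounded up to the 8-byte pointer size.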
Index: llvm/trunk/lib/Target/X86/X86CallingConv.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86CallingConv.h
+++ llvm/trunk/lib/Target/X86/X86CallingConv.h
@@ -20,6 +20,19 @@
 
 namespace llvm {
 
+inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
+                                         MVT &LocVT,
+                                         CCValAssign::LocInfo &LocInfo,
+                                         ISD::ArgFlagsTy &ArgFlags,
+                                         CCState &State) {
+  // Similar to CCPassIndirect, with the addition of inreg.
+  LocVT = MVT::i32;
+  LocInfo = CCValAssign::Indirect;
+  ArgFlags.setInReg();
+  return false; // Continue the search, but now for i32.
+}
+
+
 inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
                                 CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
                                 CCState &) {
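The custom routine above rewrites an overflowing vector argument into an inreg
i32 pointer and lets the assignment search continue, so on 32-bit x86 a vector
that does not fit in XMM0-XMM5 should be passed by address in an integer
register (or on the stack) rather than by value. A sketch with a hypothetical
declaration:

; The seventh vector gets no XMM register; only its address is passed.
declare x86_vectorcallcc <4 x float>
    @seven_vecs(<4 x float>, <4 x float>, <4 x float>, <4 x float>,
                <4 x float>, <4 x float>, <4 x float>)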
Index: llvm/trunk/lib/Target/X86/X86CallingConv.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86CallingConv.td
+++ llvm/trunk/lib/Target/X86/X86CallingConv.td
@@ -124,6 +124,24 @@
   CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
 ]>;
 
+// X86-32 vectorcall return-value convention.
+def RetCC_X86_32_VectorCall : CallingConv<[
+  // Vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit FP vectors
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // 512-bit FP vectors
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+  // Return integers in the standard way.
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
 // X86-64 C return-value convention.
 def RetCC_X86_64_C : CallingConv<[
   // The X86-64 calling convention always returns FP values in XMM0.
@@ -179,6 +197,7 @@
   CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
   // If HiPE, use RetCC_X86_32_HiPE.
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
 
   // Otherwise, use RetCC_X86_32_C.
   CCDelegateTo<RetCC_X86_32_C>
@@ -330,6 +349,25 @@
   CCIfType<[f80], CCAssignToStack<0, 0>>
 ]>;
 
+def CC_X86_Win64_VectorCall : CallingConv<[
+  // The first 6 floating point and vector types of 128 bits or less use
+  // XMM0-XMM5.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
+
+  // 256-bit vectors use YMM registers.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
+
+  // 512-bit vectors use ZMM registers.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+
+  // Delegate to the normal Win64 C convention to handle integer types.
+  CCDelegateTo<CC_X86_Win64_C>
+]>;
+
+
 def CC_X86_64_GHC : CallingConv<[
   // Promote i8/i16/i32 arguments to i64.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
@@ -463,6 +501,30 @@
   CCDelegateTo<CC_X86_32_Common>
 ]>;
 
+def CC_X86_32_VectorCall : CallingConv<[
+  // The first 6 floating point and vector types of 128 bits or less use
+  // XMM0-XMM5.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
+
+  // 256-bit vectors use YMM registers.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
+
+  // 512-bit vectors use ZMM registers.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+
+  // Otherwise, pass it indirectly.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64,
+            v32i8, v16i16, v8i32, v4i64, v8f32, v4f64,
+            v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCCustom<"CC_X86_32_VectorCallIndirect">>,
+
+  // Delegate to fastcall to handle integer types.
+  CCDelegateTo<CC_X86_32_FastCall>
+]>;
+
 def CC_X86_32_ThisCall_Common : CallingConv<[
   // The first integer argument is passed in ECX
   CCIfType<[i32], CCAssignToReg<[ECX]>>,
@@ -576,6 +638,7 @@
 // This is the root argument convention for the X86-32 backend.
 def CC_X86_32 : CallingConv<[
   CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
   CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
   CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
@@ -593,6 +656,7 @@
   CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
   CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
   CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
 
   // Mingw64 and native Win64 use Win64 CC
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
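As a quick illustration of the tables above (hypothetical function name):
under both root conventions the first 128-bit vector argument and the vector
return value use XMM0, so this should compile to little more than a return.

define x86_vectorcallcc <4 x float> @first_vec(<4 x float> %v) {
  ret <4 x float> %v     ; %v arrives in XMM0 and is also returned in XMM0
}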
Index: llvm/trunk/test/CodeGen/X86/vectorcall.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vectorcall.ll
+++ llvm/trunk/test/CodeGen/X86/vectorcall.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=i686-pc-win32 -mattr=+sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+; RUN: llc -mtriple=x86_64-pc-win32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+
+; Test integer arguments.
+
+define x86_vectorcallcc i32 @test_int_1() {
+  ret i32 0
+}
+
+; CHECK-LABEL: {{^}}test_int_1@@0:
+; CHECK: xorl %eax, %eax
+
+define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) {
+  ret i32 %a
+}
+
+; X86-LABEL: {{^}}test_int_2@@4:
+; X64-LABEL: {{^}}test_int_2@@8:
+; CHECK: movl %ecx, %eax
+
+define x86_vectorcallcc i32 @test_int_3(i64 inreg %a) {
+  %at = trunc i64 %a to i32
+  ret i32 %at
+}
+
+; X86-LABEL: {{^}}test_int_3@@8:
+; X64-LABEL: {{^}}test_int_3@@8:
+; CHECK: movl %ecx, %eax
+
+define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) {
+  %s = add i32 %a, %b
+  ret i32 %s
+}
+
+; X86-LABEL: {{^}}test_int_4@@8:
+; X86: leal (%ecx,%edx), %eax
+
+; X64-LABEL: {{^}}test_int_4@@16:
+; X64: leal (%rcx,%rdx), %eax
+
+define x86_vectorcallcc i32 @"\01test_int_5"(i32, i32) {
+  ret i32 0
+}
+; CHECK-LABEL: {{^}}test_int_5:
+
+define x86_vectorcallcc double @test_fp_1(double %a, double %b) {
+  ret double %b
+}
+; CHECK-LABEL: {{^}}test_fp_1@@16:
+; CHECK: movaps %xmm1, %xmm0
+
+define x86_vectorcallcc double @test_fp_2(
+    double, double, double, double, double, double, double %r) {
+  ret double %r
+}
+; CHECK-LABEL: {{^}}test_fp_2@@56:
+; CHECK: movsd {{[0-9]+\(%[re]sp\)}}, %xmm0
+
+define x86_vectorcallcc {double, double, double, double} @test_fp_3() {
+  ret {double, double, double, double}
+      { double 0.0, double 0.0, double 0.0, double 0.0 }
+}
+; CHECK-LABEL: {{^}}test_fp_3@@0:
+; CHECK: xorps %xmm0
+; CHECK: xorps %xmm1
+; CHECK: xorps %xmm2
+; CHECK: xorps %xmm3
+
+; FIXME: Returning via x87 isn't compatible, but it's hard to structure the
+; tablegen any other way.
+define x86_vectorcallcc {double, double, double, double, double} @test_fp_4() {
+  ret {double, double, double, double, double}
+      { double 0.0, double 0.0, double 0.0, double 0.0, double 0.0 }
+}
+; CHECK-LABEL: {{^}}test_fp_4@@0:
+; CHECK: fldz
+; CHECK: xorps %xmm0
+; CHECK: xorps %xmm1
+; CHECK: xorps %xmm2
+; CHECK: xorps %xmm3
+
+define x86_vectorcallcc <16 x i8> @test_vec_1(<16 x i8> %a, <16 x i8> %b) {
+  ret <16 x i8> %b
+}
+; CHECK-LABEL: {{^}}test_vec_1@@32:
+; CHECK: movaps %xmm1, %xmm0
+
+define x86_vectorcallcc <16 x i8> @test_vec_2(
+    double, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> %r) {
+  ret <16 x i8> %r
+}
+; CHECK-LABEL: {{^}}test_vec_2@@104:
+; CHECK: movaps (%{{[re]}}cx), %xmm0