diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -164,6 +164,12 @@
   /// such mapping exists, return the empty string.
   StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const;
 
+  /// Returns vectorized function custom calling convention. Returns empty
+  /// Optional if no special calling convention is needed for function.
+  Optional<CallingConv::ID>
+  getVectorizedFunctionCallingConv(StringRef F, const FunctionType &FTy,
+                                   const DataLayout &DL) const;
+
   /// Set to true iff i32 parameters to library functions should have signext
   /// or zeroext attributes if they correspond to C-level int or unsigned int,
   /// respectively.
@@ -320,6 +326,11 @@
   StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const {
     return Impl->getVectorizedFunction(F, VF);
   }
+  Optional<CallingConv::ID>
+  getVectorizedFunctionCallingConv(StringRef F, const FunctionType &FTy,
+                                   const DataLayout &DL) const {
+    return Impl->getVectorizedFunctionCallingConv(F, FTy, DL);
+  }
 
   /// Tests if the function is both available and a candidate for
   /// optimized code generation.
diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
--- a/llvm/include/llvm/IR/CallingConv.h
+++ b/llvm/include/llvm/IR/CallingConv.h
@@ -247,6 +247,11 @@
   /// M68k_INTR - Calling convention used for M68k interrupt routines.
   M68k_INTR = 101,
 
+  /// Intel_SVML - Calling conventions for Intel Short Math Vector Library
+  Intel_SVML128 = 102,
+  Intel_SVML256 = 103,
+  Intel_SVML512 = 104,
+
   /// The highest possible calling convention ID. Must be some 2^k - 1.
   MaxID = 1023
 };
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -1677,6 +1677,33 @@
   return StringRef();
 }
 
+static CallingConv::ID getSVMLCallingConv(const DataLayout &DL,
+                                          const FunctionType &FType) {
+  assert(isa<VectorType>(FType.getReturnType()));
+  auto *VecCallRetType = cast<VectorType>(FType.getReturnType());
+  auto TypeBitWidth = DL.getTypeSizeInBits(VecCallRetType);
+  if (TypeBitWidth == 128) {
+    return CallingConv::Intel_SVML128;
+  } else if (TypeBitWidth == 256) {
+    return CallingConv::Intel_SVML256;
+  } else if (TypeBitWidth == 512) {
+    return CallingConv::Intel_SVML512;
+  } else {
+    llvm_unreachable("Invalid vector width");
+  }
+  return 0; // not reachable
+}
+
+Optional<CallingConv::ID>
+TargetLibraryInfoImpl::getVectorizedFunctionCallingConv(
+    StringRef F, const FunctionType &FTy, const DataLayout &DL) const {
+  if (ClVectorLibrary == SVML) {
+    assert(F.startswith("__svml"));
+    return getSVMLCallingConv(DL, FTy);
+  }
+  return {};
+}
+
 TargetLibraryInfo TargetLibraryAnalysis::run(const Function &F,
                                              FunctionAnalysisManager &) {
   if (!BaselineInfoImpl)
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -604,6 +604,9 @@
   KEYWORD(spir_kernel);
   KEYWORD(spir_func);
   KEYWORD(intel_ocl_bicc);
+  KEYWORD(intel_svmlcc128);
+  KEYWORD(intel_svmlcc256);
+  KEYWORD(intel_svmlcc512);
   KEYWORD(x86_64_sysvcc);
   KEYWORD(win64cc);
   KEYWORD(x86_regcallcc);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -2095,6 +2095,9 @@
 ///   ::= 'ccc'
 ///   ::= 'fastcc'
 ///   ::= 'intel_ocl_bicc'
+///   ::= 'intel_svmlcc128'
+///   ::= 'intel_svmlcc256'
+///   ::= 'intel_svmlcc512'
 ///   ::= 'coldcc'
 ///   ::= 'cfguard_checkcc'
 ///   ::= 'x86_stdcallcc'
@@ -2163,6 +2166,15 @@
   case lltok::kw_spir_kernel:    CC = CallingConv::SPIR_KERNEL; break;
   case lltok::kw_spir_func:      CC = CallingConv::SPIR_FUNC; break;
   case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
+  case lltok::kw_intel_svmlcc128:
+    CC = CallingConv::Intel_SVML128;
+    break;
+  case lltok::kw_intel_svmlcc256:
+    CC = CallingConv::Intel_SVML256;
+    break;
+  case lltok::kw_intel_svmlcc512:
+    CC = CallingConv::Intel_SVML512;
+    break;
   case lltok::kw_x86_64_sysvcc:  CC = CallingConv::X86_64_SysV; break;
   case lltok::kw_win64cc:        CC = CallingConv::Win64; break;
   case lltok::kw_webkit_jscc:    CC = CallingConv::WebKit_JS; break;
diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h
--- a/llvm/lib/AsmParser/LLToken.h
+++ b/llvm/lib/AsmParser/LLToken.h
@@ -133,6 +133,9 @@
   kw_fastcc,
   kw_coldcc,
   kw_intel_ocl_bicc,
+  kw_intel_svmlcc128,
+  kw_intel_svmlcc256,
+  kw_intel_svmlcc512,
   kw_cfguard_checkcc,
   kw_x86_stdcallcc,
   kw_x86_fastcallcc,
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -371,6 +371,15 @@
   case CallingConv::X86_RegCall:   Out << "x86_regcallcc"; break;
   case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
   case CallingConv::Intel_OCL_BI:  Out << "intel_ocl_bicc"; break;
+  case CallingConv::Intel_SVML128:
+    Out << "intel_svmlcc128";
+    break;
+  case CallingConv::Intel_SVML256:
+    Out << "intel_svmlcc256";
+    break;
+  case CallingConv::Intel_SVML512:
+    Out << "intel_svmlcc512";
+    break;
   case CallingConv::ARM_APCS:      Out << "arm_apcscc"; break;
   case CallingConv::ARM_AAPCS:     Out << "arm_aapcscc"; break;
   case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2472,6 +2472,9 @@
   case CallingConv::Fast:
   case CallingConv::Cold:
   case CallingConv::Intel_OCL_BI:
+  case CallingConv::Intel_SVML128:
+  case CallingConv::Intel_SVML256:
+  case CallingConv::Intel_SVML512:
   case CallingConv::PTX_Kernel:
   case CallingConv::PTX_Device:
     Assert(!F.isVarArg(), "Calling convention does not support varargs or "
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -92,14 +92,14 @@
   // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32)
   CCIfType<[v64i1], CCPromoteToType>,
-  CCIfSubtarget<"is64Bit()", CCIfType<[i64],
+  CCIfSubtarget<"is64Bit()", CCIfType<[i64],
                CCAssignToReg>>,
-  CCIfSubtarget<"is32Bit()", CCIfType<[i64],
+  CCIfSubtarget<"is32Bit()", CCIfType<[i64],
                CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
 
   // float, double, float128 --> XMM
   // In the case of SSE disabled --> save to stack
-  CCIfType<[f32, f64, f128],
+  CCIfType<[f32, f64, f128],
           CCIfSubtarget<"hasSSE1()", CCAssignToReg>>,
 
   // long double --> FP
@@ -107,23 +107,23 @@
   // __m128, __m128i, __m128d --> XMM
   // In the case of SSE disabled --> save to stack
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
           CCIfSubtarget<"hasSSE1()", CCAssignToReg>>,
 
   // __m256, __m256i, __m256d --> YMM
   // In the case of SSE disabled --> save to stack
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
           CCIfSubtarget<"hasAVX()", CCAssignToReg>>,
 
   // __m512, __m512i, __m512d --> ZMM
   // In the case of SSE disabled --> save to stack
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
           CCIfSubtarget<"hasAVX512()",CCAssignToReg>>,
 
   // If no register was found -> assign to stack
   // In 64 bit, assign 64/32 bit values to 8 byte stack
-  CCIfSubtarget<"is64Bit()", CCIfType<[i32, i64, f32, f64],
+  CCIfSubtarget<"is64Bit()", CCIfType<[i32, i64, f32, f64],
               CCAssignToStack<8, 8>>>,
 
   // In 32 bit, assign 64/32 bit values to 8/4 byte stack
@@ -134,16 +134,16 @@
   CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>,
   CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
 
-  // float 128 get stack slots whose size and alignment depends
+  // float 128 get stack slots whose size and alignment depends
   // on the subtarget.
   CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
 
   // Vectors get 16-byte stack slots that are 16-byte aligned.
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
           CCAssignToStack<16, 16>>,
 
   // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
           CCAssignToStack<32, 32>>,
 
   // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
@@ -171,28 +171,28 @@
   // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32)
   CCIfType<[v64i1], CCPromoteToType>,
-  CCIfSubtarget<"is64Bit()", CCIfType<[i64],
+  CCIfSubtarget<"is64Bit()", CCIfType<[i64],
                CCAssignToReg>>,
-  CCIfSubtarget<"is32Bit()", CCIfType<[i64],
+  CCIfSubtarget<"is32Bit()", CCIfType<[i64],
                CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
 
   // long double --> FP
   CCIfType<[f80], CCAssignToReg>,
 
   // float, double, float128 --> XMM
-  CCIfType<[f32, f64, f128],
+  CCIfType<[f32, f64, f128],
           CCIfSubtarget<"hasSSE1()", CCAssignToReg>>,
 
   // __m128, __m128i, __m128d --> XMM
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
           CCIfSubtarget<"hasSSE1()", CCAssignToReg>>,
 
   // __m256, __m256i, __m256d --> YMM
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
           CCIfSubtarget<"hasAVX()", CCAssignToReg>>,
 
   // __m512, __m512i, __m512d --> ZMM
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
           CCIfSubtarget<"hasAVX512()", CCAssignToReg>>
 ]>;
 }
@@ -360,7 +360,7 @@
 // X86-64 vectorcall return-value convention.
 def RetCC_X86_64_Vectorcall : CallingConv<[
   // Vectorcall calling convention always returns FP values in XMMs.
-  CCIfType<[f32, f64, f128],
+  CCIfType<[f32, f64, f128],
     CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
 
   // Otherwise, everything is the same as Windows X86-64 C CC.
@@ -477,7 +477,7 @@
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo>>,
   CCIfCC<"CallingConv::X86_RegCall",
          CCDelegateTo>,
-
+
   // Mingw64 and native Win64 use Win64 CC
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo>,
 
@@ -485,6 +485,22 @@
   CCDelegateTo
 ]>;
 
+// Intel_SVML return-value convention.
+def RetCC_Intel_SVML : CallingConv<[
+  // Vector types are returned in XMM0,XMM1
+  CCIfType<[v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1]>>,
+
+  // 256-bit FP vectors
+  CCIfType<[v8f32, v4f64],
+            CCAssignToReg<[YMM0,YMM1]>>,
+
+  // 512-bit FP vectors
+  CCIfType<[v16f32, v8f64],
+            CCAssignToReg<[ZMM0,ZMM1]>>
+]>;
+
+
 // This is the return-value convention used for the entire X86 backend.
 let Entry = 1 in
 def RetCC_X86 : CallingConv<[
@@ -492,6 +508,10 @@
   // Check if this is the Intel OpenCL built-ins calling convention
   CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>,
 
+  CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo<RetCC_Intel_SVML>>,
+  CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo<RetCC_Intel_SVML>>,
+  CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo<RetCC_Intel_SVML>>,
+
   CCIfSubtarget<"is64Bit()", CCDelegateTo>,
   CCDelegateTo
 ]>;
@@ -1015,6 +1035,22 @@
   CCDelegateTo
 ]>;
 
+// X86-64 Intel Short Vector Math Library calling convention.
+def CC_Intel_SVML : CallingConv<[
+
+  // The SSE vector arguments are passed in XMM registers.
+  CCIfType<[v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2]>>,
+
+  // The 256-bit vector arguments are passed in YMM registers.
+  CCIfType<[v8f32, v4f64],
+           CCAssignToReg<[YMM0, YMM1, YMM2]>>,
+
+  // The 512-bit vector arguments are passed in ZMM registers.
+  CCIfType<[v16f32, v8f64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>
+]>;
+
 //===----------------------------------------------------------------------===//
 // X86 Root Argument Calling Conventions
 //===----------------------------------------------------------------------===//
@@ -1066,6 +1102,9 @@
 let Entry = 1 in
 def CC_X86 : CallingConv<[
   CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>,
+  CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo<CC_Intel_SVML>>,
+  CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo<CC_Intel_SVML>>,
+  CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo<CC_Intel_SVML>>,
   CCIfSubtarget<"is64Bit()", CCDelegateTo>,
   CCDelegateTo
 ]>;
@@ -1170,9 +1209,33 @@
 def CSR_Win32_CFGuard_Check : CalleeSavedRegs<(add CSR_32_RegCall, ECX)>;
 
 def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
                                               (sequence "R%u", 10, 15))>;
-def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE,
+def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE,
                                         (sequence "XMM%u", 8, 15))>;
 
 def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
                                                (sequence "R%u", 12, 15))>;
-def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
+def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
+                                          (sequence "XMM%u", 8, 15))>;
+
+// SVML calling convention
+def CSR_32_Intel_SVML : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>;
+def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML,
+                                                K4, K5, K6, K7)>;
+
+def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>;
+
+def CSR_64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
                                          (sequence "XMM%u", 8, 15))>;
+def CSR_Win64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                            (sequence "XMM%u", 6, 15))>;
+
+def CSR_64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                             (sequence "YMM%u", 8, 15))>;
+def CSR_Win64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                (sequence "YMM%u", 6, 15))>;
+
+def CSR_64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                (sequence "ZMM%u", 16, 31),
+                                                K4, K5, K6, K7)>;
+def CSR_Win64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                   (sequence "ZMM%u", 6, 21),
+                                                   K4, K5, K6, K7)>;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3546,7 +3546,8 @@
     // FIXME: Only some x86_32 calling conventions support AVX512.
     if (Subtarget.useAVX512Regs() &&
         (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
-                       CallConv == CallingConv::Intel_OCL_BI)))
+                       CallConv == CallingConv::Intel_OCL_BI ||
+                       CallConv == CallingConv::Intel_SVML512)))
       VecVT = MVT::v16f32;
     else if (Subtarget.hasAVX())
       VecVT = MVT::v8f32;
@@ -37591,7 +37592,7 @@
     return Res;
 
   // Fold vperm2x128 subvector shuffle with an inner concat pattern.
-  // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
+  // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
   auto FindSubVector128 = [&](unsigned Idx) {
     if (Idx > 3)
       return SDValue();
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -272,6 +272,46 @@
   }
 }
 
+static std::pair<const uint32_t *, const MCPhysReg *>
+getSVMLRegMaskAndSaveList(bool Is64Bit, bool IsWin64, CallingConv::ID CC) {
+  assert(CC >= CallingConv::Intel_SVML128 && CC <= CallingConv::Intel_SVML512);
+  unsigned Abi = CC - CallingConv::Intel_SVML128; // 0 - 128, 1 - 256, 2 - 512
+
+  const std::pair<const uint32_t *, const MCPhysReg *> Abi64[] = {
+      std::make_pair(CSR_64_Intel_SVML_RegMask, CSR_64_Intel_SVML_SaveList),
+      std::make_pair(CSR_64_Intel_SVML_AVX_RegMask,
+                     CSR_64_Intel_SVML_AVX_SaveList),
+      std::make_pair(CSR_64_Intel_SVML_AVX512_RegMask,
+                     CSR_64_Intel_SVML_AVX512_SaveList),
+  };
+
+  const std::pair<const uint32_t *, const MCPhysReg *> AbiWin64[] = {
+      std::make_pair(CSR_Win64_Intel_SVML_RegMask,
+                     CSR_Win64_Intel_SVML_SaveList),
+      std::make_pair(CSR_Win64_Intel_SVML_AVX_RegMask,
+                     CSR_Win64_Intel_SVML_AVX_SaveList),
+      std::make_pair(CSR_Win64_Intel_SVML_AVX512_RegMask,
+                     CSR_Win64_Intel_SVML_AVX512_SaveList),
+  };
+
+  const std::pair<const uint32_t *, const MCPhysReg *> Abi32[] = {
+      std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList),
+      std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList),
+      std::make_pair(CSR_32_Intel_SVML_AVX512_RegMask,
+                     CSR_32_Intel_SVML_AVX512_SaveList),
+  };
+
+  if (Is64Bit) {
+    if (IsWin64) {
+      return AbiWin64[Abi];
+    } else {
+      return Abi64[Abi];
+    }
+  } else {
+    return Abi32[Abi];
+  }
+}
+
 const MCPhysReg *
 X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   assert(MF && "MachineFunction required");
@@ -327,6 +367,11 @@
       return CSR_64_Intel_OCL_BI_SaveList;
     break;
   }
+  case CallingConv::Intel_SVML128:
+  case CallingConv::Intel_SVML256:
+  case CallingConv::Intel_SVML512: {
+    return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).second;
+  }
   case CallingConv::HHVM:
     return CSR_64_HHVM_SaveList;
   case CallingConv::X86_RegCall:
@@ -445,6 +490,11 @@
      return CSR_64_Intel_OCL_BI_RegMask;
    break;
  }
+  case CallingConv::Intel_SVML128:
+  case CallingConv::Intel_SVML256:
+  case CallingConv::Intel_SVML512: {
+    return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).first;
+  }
   case CallingConv::HHVM:
     return CSR_64_HHVM_RegMask;
   case CallingConv::X86_RegCall:
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -890,6 +890,9 @@
     case CallingConv::X86_ThisCall:
     case CallingConv::X86_VectorCall:
     case CallingConv::Intel_OCL_BI:
+    case CallingConv::Intel_SVML128:
+    case CallingConv::Intel_SVML256:
+    case CallingConv::Intel_SVML512:
       return isTargetWin64();
     // This convention allows using the Win64 convention on other targets.
     case CallingConv::Win64:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4965,6 +4965,17 @@
   } // end of switch.
 }
 
+static void setVectorFunctionCallingConv(CallInst &CI, const DataLayout &DL,
+                                         const TargetLibraryInfo &TLI) {
+  Function *VectorF = CI.getCalledFunction();
+  FunctionType *FTy = VectorF->getFunctionType();
+  StringRef VFName = VectorF->getName();
+  auto CC = TLI.getVectorizedFunctionCallingConv(VFName, *FTy, DL);
+  if (CC) {
+    CI.setCallingConv(*CC);
+  }
+}
+
 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
                                                VPUser &ArgOperands,
                                                VPTransformState &State) {
@@ -5030,6 +5041,9 @@
       if (isa(V))
        V->copyFastMathFlags(CI);
 
+      const DataLayout &DL = V->getModule()->getDataLayout();
+      setVectorFunctionCallingConv(*V, DL, *TLI);
+
       State.set(Def, V, Part);
       addMetadata(V, &I);
     }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
@@ -190,7 +190,7 @@
 
 define void @exp2f_finite(float* nocapture %varray) {
 ; CHECK-LABEL: @exp2f_finite(
-; CHECK: call <4 x float> @__svml_exp2f4(<4 x float> %{{.*}})
+; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4(<4 x float> %{{.*}})
 ; CHECK: ret void
 ;
 entry:
@@ -219,7 +219,7 @@
 
 define void @exp2_finite(double* nocapture %varray) {
 ; CHECK-LABEL: @exp2_finite(
-; CHECK: call <4 x double> @__svml_exp24(<4 x double> {{.*}})
+; CHECK: call intel_svmlcc256 <4 x double> @__svml_exp24(<4 x double> {{.*}})
 ; CHECK: ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
@@ -48,7 +48,7 @@
 
 define void @sin_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -71,7 +71,7 @@
 
 define void @sin_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -94,7 +94,7 @@
 
 define void @sin_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -117,7 +117,7 @@
 
 define void @sin_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -140,7 +140,7 @@
 
 define void @cos_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -163,7 +163,7 @@
 
 define void @cos_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -186,7 +186,7 @@
 
 define void @cos_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -209,7 +209,7 @@
 
 define void @cos_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -232,7 +232,7 @@
 
 define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64(
-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -257,7 +257,7 @@
 
 define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64_intrinsic(
-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -282,7 +282,7 @@
 
 define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f32(
-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -307,7 +307,7 @@
 
 define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f32_intrinsic(
-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -332,7 +332,7 @@
 
 define void @exp_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @exp_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -355,7 +355,7 @@
 
 define void @exp_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @exp_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -378,7 +378,7 @@
 
 define void @exp_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @exp_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -401,7 +401,7 @@
 
 define void @exp_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @exp_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -424,7 +424,7 @@
 
 define void @log_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @log_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -447,7 +447,7 @@
 
 define void @log_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @log_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -470,7 +470,7 @@
 
 define void @log_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @log_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -493,7 +493,7 @@
 
 define void @log_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @log_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -516,7 +516,7 @@
 
 define void @log2_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @log2_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -539,7 +539,7 @@
 
 define void @log2_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @log2_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -562,7 +562,7 @@
 
 define void @log2_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @log2_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -585,7 +585,7 @@
 
 define void @log2_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @log2_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -608,7 +608,7 @@
 
 define void @log10_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @log10_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -631,7 +631,7 @@
 
 define void @log10_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @log10_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -654,7 +654,7 @@
 
 define void @log10_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @log10_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -677,7 +677,7 @@
 
 define void @log10_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @log10_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -700,7 +700,7 @@
 
 define void @sqrt_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @sqrt_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -723,7 +723,7 @@
 
 define void @sqrt_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @sqrt_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sqrtf4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -746,7 +746,7 @@
 
 define void @exp2_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -769,7 +769,7 @@
 
 define void @exp2_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -792,7 +792,7 @@
 
 define void @exp2_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -815,7 +815,7 @@
 
 define void @exp2_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim
--- a/llvm/utils/vim/syntax/llvm.vim
+++ b/llvm/utils/vim/syntax/llvm.vim
@@ -103,6 +103,9 @@
       \ inlinehint
       \ inreg
       \ intel_ocl_bicc
+      \ intel_svmlcc128
+      \ intel_svmlcc256
+      \ intel_svmlcc512
       \ inteldialect
       \ internal
       \ jumptable
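Usage note (illustrative, not part of the patch): once the LLLexer/LLParser/AsmWriter changes above are in place, the new conventions are spelled directly in textual IR. Below is a minimal hand-written sketch modeled on the __svml_sin4 calls checked in svml-calls.ll; the @caller wrapper is hypothetical and only shows where the keyword appears on a declaration and on a call site.

; Hypothetical standalone module assuming a 256-bit SVML-style callee.
declare intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double>)

define <4 x double> @caller(<4 x double> %x) {
entry:
  ; The call site carries the same convention as the callee; mismatched
  ; conventions between caller and callee are undefined behavior in LLVM IR.
  %r = call intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> %x)
  ret <4 x double> %r
}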