Index: include/llvm/Analysis/TargetLibraryInfo.h
===================================================================
--- include/llvm/Analysis/TargetLibraryInfo.h
+++ include/llvm/Analysis/TargetLibraryInfo.h
@@ -38,6 +38,12 @@
   NumLibFuncs
 };
 
+enum SVMLAccuracy {
+  SVML_DEFAULT,
+  SVML_HA,
+  SVML_EP
+};
+
 /// Implementation of the target library information.
 ///
 /// This class constructs tables that hold the target library information and
@@ -150,7 +156,8 @@
   /// Return true if the function F has a vector equivalent with vectorization
   /// factor VF.
   bool isFunctionVectorizable(StringRef F, unsigned VF) const {
-    return !getVectorizedFunction(F, VF).empty();
+    bool Ignored;
+    return !getVectorizedFunction(F, VF, Ignored, false).empty();
   }
 
   /// Return true if the function F has a vector equivalent with any
@@ -159,7 +166,8 @@
 
   /// Return the name of the equivalent of F, vectorized with factor VF. If no
   /// such mapping exists, return the empty string.
-  StringRef getVectorizedFunction(StringRef F, unsigned VF) const;
+  std::string getVectorizedFunction(StringRef F, unsigned VF, bool &FromSVML,
+                                    bool IsFast) const;
 
   /// Return true if the function F has a scalar equivalent, and set VF to be
   /// the vectorization factor.
@@ -253,8 +261,9 @@
   bool isFunctionVectorizable(StringRef F) const {
     return Impl->isFunctionVectorizable(F);
   }
-  StringRef getVectorizedFunction(StringRef F, unsigned VF) const {
-    return Impl->getVectorizedFunction(F, VF);
+  std::string getVectorizedFunction(StringRef F, unsigned VF, bool &FromSVML,
+                                    bool IsFast) const {
+    return Impl->getVectorizedFunction(F, VF, FromSVML, IsFast);
   }
 
   /// Tests if the function is both available and a candidate for optimized code
Index: include/llvm/IR/CMakeLists.txt
===================================================================
--- include/llvm/IR/CMakeLists.txt
+++ include/llvm/IR/CMakeLists.txt
@@ -4,3 +4,7 @@
 set(LLVM_TARGET_DEFINITIONS Intrinsics.td)
 tablegen(LLVM Intrinsics.inc -gen-intrinsic)
 add_public_tablegen_target(intrinsics_gen)
+
+set(LLVM_TARGET_DEFINITIONS SVML.td)
+tablegen(LLVM SVML.inc -gen-svml)
+add_public_tablegen_target(svml_gen)
Index: include/llvm/IR/CallingConv.h
===================================================================
--- include/llvm/IR/CallingConv.h
+++ include/llvm/IR/CallingConv.h
@@ -220,6 +220,9 @@
   /// shader if tessellation is in use, or otherwise the vertex shader.
   AMDGPU_ES = 96,
 
+  /// Intel_SVML - Calling convention for the Intel Short Vector Math Library.
+  Intel_SVML = 97,
+
   /// The highest possible calling convention ID. Must be some 2^k - 1.
   MaxID = 1023
 };
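Note: together with the lexer, parser, and printer changes below, the new convention round-trips through textual IR. A minimal hand-written illustration (not part of the patch):

    declare intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double>)

    define <4 x double> @caller(<4 x double> %x) {
      ; The call site must carry the same calling convention as the callee.
      %r = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> %x)
      ret <4 x double> %r
    }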
Index: include/llvm/IR/SVML.td
===================================================================
--- /dev/null
+++ include/llvm/IR/SVML.td
@@ -0,0 +1,62 @@
+//===-- SVML.td - Defines SVML call variants ----------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is used by TableGen to define the different types of SVML function
+// variants used with -fveclib=SVML.
+//
+//===----------------------------------------------------------------------===//
+
+class SvmlVariant;
+
+def sin : SvmlVariant;
+def cos : SvmlVariant;
+def pow : SvmlVariant;
+def exp : SvmlVariant;
+def log : SvmlVariant;
+def acos : SvmlVariant;
+def acosh : SvmlVariant;
+def asin : SvmlVariant;
+def asinh : SvmlVariant;
+def atan2 : SvmlVariant;
+def atan : SvmlVariant;
+def atanh : SvmlVariant;
+def cbrt : SvmlVariant;
+def cdfnorm : SvmlVariant;
+def cdfnorminv : SvmlVariant;
+def cosd : SvmlVariant;
+def cosh : SvmlVariant;
+def erf : SvmlVariant;
+def erfc : SvmlVariant;
+def erfcinv : SvmlVariant;
+def erfinv : SvmlVariant;
+def exp10 : SvmlVariant;
+def exp2 : SvmlVariant;
+def expm1 : SvmlVariant;
+def hypot : SvmlVariant;
+def invsqrt : SvmlVariant;
+def log10 : SvmlVariant;
+def log1p : SvmlVariant;
+def log2 : SvmlVariant;
+def sind : SvmlVariant;
+def sinh : SvmlVariant;
+def sqrt : SvmlVariant;
+def tan : SvmlVariant;
+def tanh : SvmlVariant;
+
+// TODO: SVML does not currently provide _ha and _ep variants of these functions.
+// We should call the default variant of these functions in all cases instead.
+
+// def nearbyint : SvmlVariant;
+// def logb : SvmlVariant;
+// def floor : SvmlVariant;
+// def fmod : SvmlVariant;
+// def ceil : SvmlVariant;
+// def trunc : SvmlVariant;
+// def rint : SvmlVariant;
+// def round : SvmlVariant;
Index: lib/Analysis/CMakeLists.txt
===================================================================
--- lib/Analysis/CMakeLists.txt
+++ lib/Analysis/CMakeLists.txt
@@ -92,4 +92,5 @@
 
   DEPENDS
   intrinsics_gen
+  svml_gen
   )
Index: lib/Analysis/TargetLibraryInfo.cpp
===================================================================
--- lib/Analysis/TargetLibraryInfo.cpp
+++ lib/Analysis/TargetLibraryInfo.cpp
@@ -50,6 +50,11 @@
   return true;
 }
 
+std::string svmlMangle(StringRef FnName, const bool IsFast) {
+  std::string FullName = FnName;
+  return IsFast ? FullName : FullName + "_ha";
+}
+
 /// Initialize the set of available library functions based on the specified
 /// target triple. This should be carefully written so that a missing target
 /// triple gets a sane set of defaults.
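Note: svmlMangle selects between the default and the high-accuracy ("_ha") SVML entry points. A standalone sketch of the intended behavior, for illustration only (svmlMangleSketch is not part of the patch):

    #include <cassert>
    #include <string>
    #include "llvm/ADT/StringRef.h"

    // Append the high-accuracy suffix unless the call site is fast-math.
    static std::string svmlMangleSketch(llvm::StringRef FnName, bool IsFast) {
      std::string FullName = FnName.str();
      return IsFast ? FullName : FullName + "_ha";
    }

    int main() {
      assert(svmlMangleSketch("__svml_sin4", /*IsFast=*/true) == "__svml_sin4");
      assert(svmlMangleSketch("__svml_sin4", /*IsFast=*/false) == "__svml_sin4_ha");
    }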
@@ -1453,109 +1458,9 @@
   }
   case SVML: {
     const VecDesc VecFuncs[] = {
-        {"sin", "__svml_sin2", 2},
-        {"sin", "__svml_sin4", 4},
-        {"sin", "__svml_sin8", 8},
-
-        {"sinf", "__svml_sinf4", 4},
-        {"sinf", "__svml_sinf8", 8},
-        {"sinf", "__svml_sinf16", 16},
-
-        {"llvm.sin.f64", "__svml_sin2", 2},
-        {"llvm.sin.f64", "__svml_sin4", 4},
-        {"llvm.sin.f64", "__svml_sin8", 8},
-
-        {"llvm.sin.f32", "__svml_sinf4", 4},
-        {"llvm.sin.f32", "__svml_sinf8", 8},
-        {"llvm.sin.f32", "__svml_sinf16", 16},
-
-        {"cos", "__svml_cos2", 2},
-        {"cos", "__svml_cos4", 4},
-        {"cos", "__svml_cos8", 8},
-
-        {"cosf", "__svml_cosf4", 4},
-        {"cosf", "__svml_cosf8", 8},
-        {"cosf", "__svml_cosf16", 16},
-
-        {"llvm.cos.f64", "__svml_cos2", 2},
-        {"llvm.cos.f64", "__svml_cos4", 4},
-        {"llvm.cos.f64", "__svml_cos8", 8},
-
-        {"llvm.cos.f32", "__svml_cosf4", 4},
-        {"llvm.cos.f32", "__svml_cosf8", 8},
-        {"llvm.cos.f32", "__svml_cosf16", 16},
-
-        {"pow", "__svml_pow2", 2},
-        {"pow", "__svml_pow4", 4},
-        {"pow", "__svml_pow8", 8},
-
-        {"powf", "__svml_powf4", 4},
-        {"powf", "__svml_powf8", 8},
-        {"powf", "__svml_powf16", 16},
-
-        { "__pow_finite", "__svml_pow2", 2 },
-        { "__pow_finite", "__svml_pow4", 4 },
-        { "__pow_finite", "__svml_pow8", 8 },
-
-        { "__powf_finite", "__svml_powf4", 4 },
-        { "__powf_finite", "__svml_powf8", 8 },
-        { "__powf_finite", "__svml_powf16", 16 },
-
-        {"llvm.pow.f64", "__svml_pow2", 2},
-        {"llvm.pow.f64", "__svml_pow4", 4},
-        {"llvm.pow.f64", "__svml_pow8", 8},
-
-        {"llvm.pow.f32", "__svml_powf4", 4},
-        {"llvm.pow.f32", "__svml_powf8", 8},
-        {"llvm.pow.f32", "__svml_powf16", 16},
-
-        {"exp", "__svml_exp2", 2},
-        {"exp", "__svml_exp4", 4},
-        {"exp", "__svml_exp8", 8},
-
-        {"expf", "__svml_expf4", 4},
-        {"expf", "__svml_expf8", 8},
-        {"expf", "__svml_expf16", 16},
-
-        { "__exp_finite", "__svml_exp2", 2 },
-        { "__exp_finite", "__svml_exp4", 4 },
-        { "__exp_finite", "__svml_exp8", 8 },
-
-        { "__expf_finite", "__svml_expf4", 4 },
-        { "__expf_finite", "__svml_expf8", 8 },
-        { "__expf_finite", "__svml_expf16", 16 },
-
-        {"llvm.exp.f64", "__svml_exp2", 2},
-        {"llvm.exp.f64", "__svml_exp4", 4},
-        {"llvm.exp.f64", "__svml_exp8", 8},
-
-        {"llvm.exp.f32", "__svml_expf4", 4},
-        {"llvm.exp.f32", "__svml_expf8", 8},
-        {"llvm.exp.f32", "__svml_expf16", 16},
-
-        {"log", "__svml_log2", 2},
-        {"log", "__svml_log4", 4},
-        {"log", "__svml_log8", 8},
-
-        {"logf", "__svml_logf4", 4},
-        {"logf", "__svml_logf8", 8},
-        {"logf", "__svml_logf16", 16},
-
-        { "__log_finite", "__svml_log2", 2 },
-        { "__log_finite", "__svml_log4", 4 },
-        { "__log_finite", "__svml_log8", 8 },
-
-        { "__logf_finite", "__svml_logf4", 4 },
-        { "__logf_finite", "__svml_logf8", 8 },
-        { "__logf_finite", "__svml_logf16", 16 },
-
-        {"llvm.log.f64", "__svml_log2", 2},
-        {"llvm.log.f64", "__svml_log4", 4},
-        {"llvm.log.f64", "__svml_log8", 8},
-
-        {"llvm.log.f32", "__svml_logf4", 4},
-        {"llvm.log.f32", "__svml_logf8", 8},
-        {"llvm.log.f32", "__svml_logf16", 16},
+#define GET_SVML_VARIANTS
+#include "llvm/IR/SVML.inc"
+#undef GET_SVML_VARIANTS
     };
     addVectorizableFunctions(VecFuncs);
     break;
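Note: SVML.inc replaces the hand-maintained table above. Based on the emitter at the end of this patch, the generated entries for one variant such as sin would look roughly like this (illustrative excerpt, not the literal generated file):

    {"sinf", "__svml_sinf4", 4},
    {"llvm.sin.f32", "__svml_sinf4", 4},
    {"__sinf_finite", "__svml_sinf4", 4},
    // ... the same three entries repeated for VL = 8 and 16 ...
    {"sin", "__svml_sin2", 2},
    {"llvm.sin.f64", "__svml_sin2", 2},
    {"__sin_finite", "__svml_sin2", 2},
    // ... the same three entries repeated for VL = 4 and 8 ...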
@@ -1576,19 +1481,26 @@
   return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName;
 }
 
-StringRef TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
-                                                       unsigned VF) const {
+std::string TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
+                                                         unsigned VF,
+                                                         bool &FromSVML,
+                                                         bool IsFast) const {
+  FromSVML = ClVectorLibrary == SVML;
   F = sanitizeFunctionName(F);
   if (F.empty())
     return F;
   std::vector<VecDesc>::const_iterator I = std::lower_bound(
       VectorDescs.begin(), VectorDescs.end(), F, compareWithScalarFnName);
   while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) {
-    if (I->VectorizationFactor == VF)
+    if (I->VectorizationFactor == VF) {
+      if (FromSVML) {
+        return svmlMangle(I->VectorFnName, IsFast);
+      }
       return I->VectorFnName;
+    }
     ++I;
   }
-  return StringRef();
+  return std::string();
 }
 
 StringRef TargetLibraryInfoImpl::getScalarizedFunction(StringRef F,
Index: lib/AsmParser/LLLexer.cpp
===================================================================
--- lib/AsmParser/LLLexer.cpp
+++ lib/AsmParser/LLLexer.cpp
@@ -596,6 +596,7 @@
   KEYWORD(spir_kernel);
   KEYWORD(spir_func);
   KEYWORD(intel_ocl_bicc);
+  KEYWORD(intel_svmlcc);
   KEYWORD(x86_64_sysvcc);
   KEYWORD(win64cc);
   KEYWORD(x86_regcallcc);
Index: lib/AsmParser/LLParser.cpp
===================================================================
--- lib/AsmParser/LLParser.cpp
+++ lib/AsmParser/LLParser.cpp
@@ -1778,6 +1778,7 @@
 ///   ::= 'ccc'
 ///   ::= 'fastcc'
 ///   ::= 'intel_ocl_bicc'
+///   ::= 'intel_svmlcc'
 ///   ::= 'coldcc'
 ///   ::= 'x86_stdcallcc'
 ///   ::= 'x86_fastcallcc'
@@ -1837,6 +1838,7 @@
   case lltok::kw_spir_kernel:    CC = CallingConv::SPIR_KERNEL; break;
   case lltok::kw_spir_func:      CC = CallingConv::SPIR_FUNC; break;
   case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
+  case lltok::kw_intel_svmlcc:   CC = CallingConv::Intel_SVML; break;
   case lltok::kw_x86_64_sysvcc:  CC = CallingConv::X86_64_SysV; break;
   case lltok::kw_win64cc:        CC = CallingConv::Win64; break;
   case lltok::kw_webkit_jscc:    CC = CallingConv::WebKit_JS; break;
Index: lib/AsmParser/LLToken.h
===================================================================
--- lib/AsmParser/LLToken.h
+++ lib/AsmParser/LLToken.h
@@ -130,6 +130,7 @@
   kw_fastcc,
   kw_coldcc,
   kw_intel_ocl_bicc,
+  kw_intel_svmlcc,
   kw_x86_stdcallcc,
   kw_x86_fastcallcc,
   kw_x86_thiscallcc,
Index: lib/IR/AsmWriter.cpp
===================================================================
--- lib/IR/AsmWriter.cpp
+++ lib/IR/AsmWriter.cpp
@@ -360,6 +360,7 @@
   case CallingConv::X86_RegCall:   Out << "x86_regcallcc"; break;
   case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
   case CallingConv::Intel_OCL_BI:  Out << "intel_ocl_bicc"; break;
+  case CallingConv::Intel_SVML:    Out << "intel_svmlcc"; break;
   case CallingConv::ARM_APCS:      Out << "arm_apcscc"; break;
   case CallingConv::ARM_AAPCS:     Out << "arm_aapcscc"; break;
   case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break;
Index: lib/IR/Verifier.cpp
===================================================================
--- lib/IR/Verifier.cpp
+++ lib/IR/Verifier.cpp
@@ -2091,6 +2091,7 @@
   case CallingConv::Fast:
   case CallingConv::Cold:
   case CallingConv::Intel_OCL_BI:
+  case CallingConv::Intel_SVML:
   case CallingConv::PTX_Kernel:
   case CallingConv::PTX_Device:
     Assert(!F.isVarArg(), "Calling convention does not support varargs or "
Index: lib/Target/X86/X86CallingConv.td
===================================================================
--- lib/Target/X86/X86CallingConv.td
+++ lib/Target/X86/X86CallingConv.td
@@ -476,12 +476,29 @@
   CCDelegateTo<RetCC_X86Common>
 ]>;
 
+// Intel_SVML return-value convention.
+def RetCC_Intel_SVML : CallingConv<[
+  // Vector types are returned in XMM0,XMM1
+  CCIfType<[v4f32, v2f64],
+           CCAssignToReg<[XMM0,XMM1]>>,
+
+  // 256-bit FP vectors
+  CCIfType<[v8f32, v4f64],
+           CCAssignToReg<[YMM0,YMM1]>>,
+
+  // 512-bit FP vectors
+  CCIfType<[v16f32, v8f64],
+           CCAssignToReg<[ZMM0,ZMM1]>>
+]>;
+
 // This is the return-value convention used for the entire X86 backend.
 def RetCC_X86 : CallingConv<[
 
   // Check if this is the Intel OpenCL built-ins calling convention
   CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
 
+  CCIfCC<"CallingConv::Intel_SVML", CCDelegateTo<RetCC_Intel_SVML>>,
+
   CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
   CCDelegateTo<RetCC_X86_32>
 ]>;
@@ -983,6 +1000,22 @@
   CCDelegateTo<CC_X86_32_C>
 ]>;
 
+// X86-64 Intel Short Vector Math Library calling convention.
+def CC_Intel_SVML : CallingConv<[
+
+  // The SSE vector arguments are passed in XMM registers.
+  CCIfType<[v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2]>>,
+
+  // The 256-bit vector arguments are passed in YMM registers.
+  CCIfType<[v8f32, v4f64],
+           CCAssignToReg<[YMM0, YMM1, YMM2]>>,
+
+  // The 512-bit vector arguments are passed in ZMM registers.
+  CCIfType<[v16f32, v8f64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>
+]>;
+
 def CC_X86_32_Intr : CallingConv<[
   CCAssignToStack<4, 4>
 ]>;
@@ -1039,6 +1072,7 @@
 // This is the argument convention used for the entire X86 backend.
 def CC_X86 : CallingConv<[
   CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+  CCIfCC<"CallingConv::Intel_SVML", CCDelegateTo<CC_Intel_SVML>>,
   CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
   CCDelegateTo<CC_X86_32>
 ]>;
@@ -1147,4 +1181,27 @@
                                             (sequence "R%u", 12, 15))>;
 def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
                                          (sequence "XMM%u", 8, 15))>;
-
+
+// SVML calling convention
+def CSR_32_Intel_SVML        : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>;
+def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML,
+                                                K4, K5, K6, K7)>;
+
+def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>;
+
+def CSR_64_Intel_SVML    : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                            (sequence "XMM%u", 8, 15))>;
+def CSR_Win64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                            (sequence "XMM%u", 6, 15))>;
+
+def CSR_64_Intel_SVML_AVX    : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                (sequence "YMM%u", 8, 15))>;
+def CSR_Win64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                (sequence "YMM%u", 6, 15))>;
+
+def CSR_64_Intel_SVML_AVX512    : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                   (sequence "ZMM%u", 16, 31),
+                                                   K4, K5, K6, K7)>;
+def CSR_Win64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                   (sequence "ZMM%u", 6, 21),
+                                                   K4, K5, K6, K7)>;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -3258,7 +3258,8 @@
   // FIXME: Only some x86_32 calling conventions support AVX512.
   if (Subtarget.hasAVX512() &&
       (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
-                   CallConv == CallingConv::Intel_OCL_BI)))
+                   CallConv == CallingConv::Intel_OCL_BI ||
+                   CallConv == CallingConv::Intel_SVML)))
     VecVT = MVT::v16f32;
   else if (Subtarget.hasAVX())
     VecVT = MVT::v8f32;
Index: lib/Target/X86/X86RegisterInfo.cpp
===================================================================
--- lib/Target/X86/X86RegisterInfo.cpp
+++ lib/Target/X86/X86RegisterInfo.cpp
@@ -311,6 +311,23 @@
       return CSR_64_Intel_OCL_BI_SaveList;
     break;
   }
+  case CallingConv::Intel_SVML: {
+    if (Is64Bit) {
+      if (HasAVX512)
+        return IsWin64 ? CSR_Win64_Intel_SVML_AVX512_SaveList :
+                         CSR_64_Intel_SVML_AVX512_SaveList;
+      if (HasAVX)
+        return IsWin64 ? CSR_Win64_Intel_SVML_AVX_SaveList :
+                         CSR_64_Intel_SVML_AVX_SaveList;
+
+      return IsWin64 ? CSR_Win64_Intel_SVML_SaveList :
+                       CSR_64_Intel_SVML_SaveList;
+    } else { // Is32Bit
+      if (HasAVX512)
+        return CSR_32_Intel_SVML_AVX512_SaveList;
+      return CSR_32_Intel_SVML_SaveList;
+    }
+  }
   case CallingConv::HHVM:
     return CSR_64_HHVM_SaveList;
   case CallingConv::X86_RegCall:
@@ -425,6 +442,23 @@
       return CSR_64_Intel_OCL_BI_RegMask;
     break;
   }
+  case CallingConv::Intel_SVML: {
+    if (Is64Bit) {
+      if (HasAVX512)
+        return IsWin64 ? CSR_Win64_Intel_SVML_AVX512_RegMask :
+                         CSR_64_Intel_SVML_AVX512_RegMask;
+      if (HasAVX)
+        return IsWin64 ? CSR_Win64_Intel_SVML_AVX_RegMask :
+                         CSR_64_Intel_SVML_AVX_RegMask;
+
+      return IsWin64 ? CSR_Win64_Intel_SVML_RegMask :
+                       CSR_64_Intel_SVML_RegMask;
+    } else { // Is32Bit
+      if (HasAVX512)
+        return CSR_32_Intel_SVML_AVX512_RegMask;
+      return CSR_32_Intel_SVML_RegMask;
+    }
+  }
   case CallingConv::HHVM:
     return CSR_64_HHVM_RegMask;
   case CallingConv::X86_RegCall:
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -768,6 +768,7 @@
     case CallingConv::X86_ThisCall:
     case CallingConv::X86_VectorCall:
     case CallingConv::Intel_OCL_BI:
+    case CallingConv::Intel_SVML:
       return isTargetWin64();
     // This convention allows using the Win64 convention on other targets.
     case CallingConv::Win64:
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4101,6 +4101,7 @@
     }
 
     Function *VectorF;
+    bool FromSVML = false;
    if (UseVectorIntrinsic) {
       // Use vector version of the intrinsic.
       Type *TysForDecl[] = {CI->getType()};
@@ -4109,7 +4110,8 @@
       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
     } else {
       // Use vector version of the library call.
-      StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+      bool IsFast = CI->getFastMathFlags().isFast();
+      std::string VFnName = TLI->getVectorizedFunction(FnName, VF, FromSVML, IsFast);
       assert(!VFnName.empty() && "Vector function name is empty.");
       VectorF = M->getFunction(VFnName);
       if (!VectorF) {
@@ -4128,7 +4130,7 @@
       if (isa<FPMathOperator>(V))
         V->copyFastMathFlags(CI);
-
+      if (FromSVML) V->setCallingConv(CallingConv::Intel_SVML);
       VectorLoopValueMap.setVectorValue(&I, Part, V);
       addMetadata(V, &I);
     }
   }
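Note: because IsFast is derived from the call site's fast-math flags, the same scalar function maps to different SVML entry points. A hypothetical sketch of the expected lowering at VF=4, assuming -vector-library=SVML:

    ; call double @sin(double %d)       -> call intel_svmlcc <4 x double> @__svml_sin4_ha(...)
    ; call fast double @sin(double %d)  -> call fast intel_svmlcc <4 x double> @__svml_sin4(...)

The updated test below checks the high-accuracy (_ha) spellings, since its call sites do not carry fast-math flags.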
Index: test/Transforms/LoopVectorize/X86/svml-calls.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/svml-calls.ll
+++ test/Transforms/LoopVectorize/X86/svml-calls.ll
@@ -31,7 +31,7 @@
 
 define void @sin_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -54,7 +54,7 @@
 
 define void @sin_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -77,7 +77,7 @@
 
 define void @sin_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -100,7 +100,7 @@
 
 define void @sin_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -123,7 +123,7 @@
 
 define void @cos_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -146,7 +146,7 @@
 
 define void @cos_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -169,7 +169,7 @@
 
 define void @cos_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -192,7 +192,7 @@
 
 define void @cos_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -215,7 +215,7 @@
 
 define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64(
-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -240,7 +240,7 @@
 
 define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64_intrinsic(
-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -265,7 +265,7 @@
 
 define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f32(
-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -290,7 +290,7 @@
 
 define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f32_intrinsic(
-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -315,7 +315,7 @@
 
 define void @exp_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @exp_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -338,7 +338,7 @@
 
 define void @exp_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @exp_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -361,7 +361,7 @@
 
 define void @exp_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @exp_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -384,7 +384,7 @@
 
 define void @exp_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @exp_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -407,7 +407,7 @@
 
 define void @log_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @log_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -430,7 +430,7 @@
 
 define void @log_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @log_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -453,7 +453,7 @@
 
 define void @log_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @log_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -476,7 +476,7 @@
 
 define void @log_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @log_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK: ret void
 ;
 entry:
@@ -497,5 +497,43 @@
   ret void
 }
 
-attributes #0 = { nounwind readnone }
+; CHECK-LABEL: @atan2_finite
+; CHECK: intel_svmlcc <8 x double> @__svml_atan28
+; CHECK: ret
+
+declare double @__atan2_finite(double, double) local_unnamed_addr #0
+
+define void @atan2_finite([100 x double]* nocapture %varray) local_unnamed_addr #0 {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc7, %entry
+  %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc7 ]
+  %0 = trunc i64 %indvars.iv19 to i32
+  %conv = sitofp i32 %0 to double
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %1 = trunc i64 %indvars.iv.next to i32
+  %conv4 = sitofp i32 %1 to double
+  %call = tail call fast double @__atan2_finite(double %conv, double %conv4)
+  %arrayidx6 = getelementptr inbounds [100 x double], [100 x double]* %varray, i64 %indvars.iv19, i64 %indvars.iv
+  store double %call, double* %arrayidx6, align 8
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.inc7, label %for.body3, !llvm.loop !5
+
+for.inc7:                                         ; preds = %for.body3
+  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+  %exitcond21 = icmp eq i64 %indvars.iv.next20, 100
+  br i1 %exitcond21, label %for.end9, label %for.cond1.preheader
+
+for.end9:                                         ; preds = %for.inc7
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+
+!5 = distinct !{!5, !6, !7}
+!6 = !{!"llvm.loop.vectorize.width", i32 8}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: utils/TableGen/CMakeLists.txt
===================================================================
--- utils/TableGen/CMakeLists.txt
+++ utils/TableGen/CMakeLists.txt
@@ -38,6 +38,7 @@
   SearchableTableEmitter.cpp
   SubtargetEmitter.cpp
   SubtargetFeatureInfo.cpp
+  SVMLEmitter.cpp
   TableGen.cpp
   Types.cpp
   X86DisassemblerTables.cpp
Index: utils/TableGen/SVMLEmitter.cpp
===================================================================
--- /dev/null
+++ utils/TableGen/SVMLEmitter.cpp
@@ -0,0 +1,110 @@
+//===------ SVMLEmitter.cpp - Generate SVML function variants ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This tablegen backend emits the scalar to svml function map for TLI.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenTarget.h"
+#include "llvm/Support/Format.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "SVMLVariants"
+#include "llvm/Support/Debug.h"
+
+namespace {
+
+class SVMLVariantsEmitter {
+
+  RecordKeeper &Records;
+
+private:
+  void emitSVMLVariants(raw_ostream &OS);
+
+public:
+  SVMLVariantsEmitter(RecordKeeper &R) : Records(R) {}
+
+  void run(raw_ostream &OS);
+};
+} // End anonymous namespace
+
+/// \brief Emit the set of SVML variant function names.
+// The default is to emit the high accuracy SVML variants until a mechanism is
+// introduced to allow a selection of different variants through precision
+// requirements specified by the user. This code generates mappings to SVML
+// for the scalar forms of LLVM intrinsics, math library calls, and the finite
+// variants of math library calls.
+void SVMLVariantsEmitter::emitSVMLVariants(raw_ostream &OS) {
+
+  const unsigned MinSinglePrecVL = 4;
+  const unsigned MaxSinglePrecVL = 16;
+  const unsigned MinDoublePrecVL = 2;
+  const unsigned MaxDoublePrecVL = 8;
+
+  OS << "#ifdef GET_SVML_VARIANTS\n";
+
+  for (const auto &D : Records.getAllDerivedDefinitions("SvmlVariant")) {
+    StringRef SvmlVariantNameStr = D->getName();
+    // Single Precision SVML
+    for (unsigned VL = MinSinglePrecVL; VL <= MaxSinglePrecVL; VL *= 2) {
+      // Emit the scalar math library function to svml function entry.
+      OS << "{\"" << SvmlVariantNameStr << "f" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+         << VL << "},\n";
+
+      // Emit the scalar intrinsic to svml function entry.
+      OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f32" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+         << VL << "},\n";
+
+      // Emit the finite math library function to svml function entry.
+      OS << "{\"__" << SvmlVariantNameStr << "f_finite" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+         << VL << "},\n";
+    }
+
+    // Double Precision SVML
+    for (unsigned VL = MinDoublePrecVL; VL <= MaxDoublePrecVL; VL *= 2) {
+      // Emit the scalar math library function to svml function entry.
+      OS << "{\"" << SvmlVariantNameStr << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << VL
+         << "},\n";
+
+      // Emit the scalar intrinsic to svml function entry.
+      OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f64" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << VL
+         << "},\n";
+
+      // Emit the finite math library function to svml function entry.
+      OS << "{\"__" << SvmlVariantNameStr << "_finite" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", "
+         << VL << "},\n";
+    }
+  }
+
+  OS << "#endif // GET_SVML_VARIANTS\n\n";
+}
+
+void SVMLVariantsEmitter::run(raw_ostream &OS) {
+  emitSVMLVariants(OS);
+}
+
+namespace llvm {
+
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS) {
+  SVMLVariantsEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
Index: utils/TableGen/TableGen.cpp
===================================================================
--- utils/TableGen/TableGen.cpp
+++ utils/TableGen/TableGen.cpp
@@ -50,6 +50,7 @@
   GenX86EVEX2VEXTables,
   GenX86FoldTables,
   GenRegisterBank,
+  GenSVMLVariants,
 };
 
 namespace {
@@ -108,7 +109,9 @@
                     clEnumValN(GenX86FoldTables, "gen-x86-fold-tables",
                                "Generate X86 fold tables"),
                     clEnumValN(GenRegisterBank, "gen-register-bank",
-                               "Generate registers bank descriptions")));
+                               "Generate registers bank descriptions"),
+                    clEnumValN(GenSVMLVariants, "gen-svml",
+                               "Generate SVML variant function names")));
 
 cl::OptionCategory PrintEnumsCat("Options for -print-enums");
 cl::opt<std::string>
@@ -213,6 +216,9 @@
     case GenX86FoldTables:
       EmitX86FoldTables(Records, OS);
       break;
+    case GenSVMLVariants:
+      EmitSVMLVariants(Records, OS);
+      break;
     }
 
     return false;
Index: utils/TableGen/TableGenBackends.h
===================================================================
--- utils/TableGen/TableGenBackends.h
+++ utils/TableGen/TableGenBackends.h
@@ -86,6 +86,7 @@
 void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS);
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS);
 
 } // End llvm namespace
Index: utils/vim/syntax/llvm.vim
===================================================================
--- utils/vim/syntax/llvm.vim
+++ utils/vim/syntax/llvm.vim
@@ -94,6 +94,7 @@
       \ inreg
       \ inteldialect
       \ intel_ocl_bicc
+      \ intel_svmlcc
      \ internal
       \ linkonce
       \ linkonce_odr
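Note: the new backend can be exercised directly. The exact flags and include paths below are assumptions based on the options this patch adds, not a documented workflow:

    # Emit the scalar-to-SVML map that becomes SVML.inc.
    llvm-tblgen -gen-svml -I include include/llvm/IR/SVML.td

    # Typical RUN-style invocation matching the updated test.
    opt -vector-library=SVML -loop-vectorize -force-vector-width=4 -S < svml-calls.ll | FileCheck svml-calls.ll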