diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -31,6 +31,7 @@
   StringRef ScalarFnName;
   StringRef VectorFnName;
   ElementCount VectorizationFactor;
+  bool Masked;
 };

 enum LibFunc : unsigned {
@@ -161,7 +162,8 @@
   /// Return true if the function F has a vector equivalent with vectorization
   /// factor VF.
   bool isFunctionVectorizable(StringRef F, const ElementCount &VF) const {
-    return !getVectorizedFunction(F, VF).empty();
+    return !(getVectorizedFunction(F, VF, false).empty() &&
+             getVectorizedFunction(F, VF, true).empty());
   }

   /// Return true if the function F has a vector equivalent with any
@@ -170,7 +172,8 @@
   /// Return the name of the equivalent of F, vectorized with factor VF. If no
   /// such mapping exists, return the empty string.
-  StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const;
+  StringRef getVectorizedFunction(StringRef F, const ElementCount &VF,
+                                  bool Masked) const;

   /// Set to true iff i32 parameters to library functions should have signext
   /// or zeroext attributes if they correspond to C-level int or unsigned int,
@@ -346,8 +349,9 @@
   bool isFunctionVectorizable(StringRef F) const {
     return Impl->isFunctionVectorizable(F);
   }
-  StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const {
-    return Impl->getVectorizedFunction(F, VF);
+  StringRef getVectorizedFunction(StringRef F, const ElementCount &VF,
+                                  bool Masked) const {
+    return Impl->getVectorizedFunction(F, VF, Masked);
   }

   /// Tests if the function is both available and a candidate for optimized code
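The TargetLibraryInfo changes above make masked and unmasked mappings distinct lookup keys, so a caller must say which flavour it wants (isFunctionVectorizable now probes both). A minimal usage sketch, assuming a TargetLibraryInfo already configured with the SLEEF mappings added below; the helper name and the unmasked-first preference are illustrative, not part of the patch:

#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Hypothetical helper: find a scalable vector variant of "sin" at
// VF = vscale x 2, preferring an unmasked mapping over a masked one.
static StringRef findScalableVariant(const TargetLibraryInfo &TLI) {
  ElementCount VF = ElementCount::getScalable(2);
  StringRef Name = TLI.getVectorizedFunction("sin", VF, /*Masked=*/false);
  if (Name.empty())
    Name = TLI.getVectorizedFunction("sin", VF, /*Masked=*/true);
  return Name; // Yields "_ZGVsMxv_sin" once the mappings below are registered.
}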
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -17,8 +17,8 @@
 #define TLI_DEFINE_VECFUNC(SCAL, VEC, VF) VEC,
 #endif

-#define FIXED(NL) ElementCount::getFixed(NL)
-#define SCALABLE(NL) ElementCount::getScalable(NL)
+#define FIXED(NL) ElementCount::getFixed(NL), false
+#define SCALABLE(NL) ElementCount::getScalable(NL), true

 #if !(defined(TLI_DEFINE_VECFUNC))
 #define TLI_DEFINE_VECFUNC(SCAL, VEC, VF) {SCAL, VEC, VF},
@@ -604,6 +604,82 @@
 TLI_DEFINE_VECFUNC( "tgammaf", "_ZGVnN4v_tgammaf", FIXED(4))
 TLI_DEFINE_VECFUNC( "llvm.tgamma.f32", "_ZGVnN4v_tgammaf", FIXED(4))

+#elif defined(TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS)
+
+TLI_DEFINE_VECFUNC("acos", "_ZGVsMxv_acos", SCALABLE(2))
+TLI_DEFINE_VECFUNC("acosf", "_ZGVsMxv_acosf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("asin", "_ZGVsMxv_asin", SCALABLE(2))
+TLI_DEFINE_VECFUNC("asinf", "_ZGVsMxv_asinf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("atan", "_ZGVsMxv_atan", SCALABLE(2))
+TLI_DEFINE_VECFUNC("atanf", "_ZGVsMxv_atanf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("atan2", "_ZGVsMxvv_atan2", SCALABLE(2))
+TLI_DEFINE_VECFUNC("atan2f", "_ZGVsMxvv_atan2f", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("atanh", "_ZGVsMxv_atanh", SCALABLE(2))
+TLI_DEFINE_VECFUNC("atanhf", "_ZGVsMxv_atanhf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("cos", "_ZGVsMxv_cos", SCALABLE(2))
+TLI_DEFINE_VECFUNC("cosf", "_ZGVsMxv_cosf", SCALABLE(4))
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVsMxv_cos", SCALABLE(2))
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVsMxv_cosf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("cosh", "_ZGVsMxv_cosh", SCALABLE(2))
+TLI_DEFINE_VECFUNC("coshf", "_ZGVsMxv_coshf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("exp", "_ZGVsMxv_exp", SCALABLE(2))
+TLI_DEFINE_VECFUNC("expf", "_ZGVsMxv_expf", SCALABLE(4))
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVsMxv_exp", SCALABLE(2))
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVsMxv_expf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("exp2", "_ZGVsMxv_exp2", SCALABLE(2))
+TLI_DEFINE_VECFUNC("exp2f", "_ZGVsMxv_exp2f", SCALABLE(4))
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "_ZGVsMxv_exp2", SCALABLE(2))
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVsMxv_exp2f", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("exp10", "_ZGVsMxv_exp10", SCALABLE(2))
+TLI_DEFINE_VECFUNC("exp10f", "_ZGVsMxv_exp10f", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("lgamma", "_ZGVsMxv_lgamma", SCALABLE(2))
+TLI_DEFINE_VECFUNC("lgammaf", "_ZGVsMxv_lgammaf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("log", "_ZGVsMxv_log", SCALABLE(2))
+TLI_DEFINE_VECFUNC("logf", "_ZGVsMxv_logf", SCALABLE(4))
+TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVsMxv_log", SCALABLE(2))
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVsMxv_logf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("log10", "_ZGVsMxv_log10", SCALABLE(2))
+TLI_DEFINE_VECFUNC("log10f", "_ZGVsMxv_log10f", SCALABLE(4))
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "_ZGVsMxv_log10", SCALABLE(2))
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVsMxv_log10f", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("pow", "_ZGVsMxvv_pow", SCALABLE(2))
+TLI_DEFINE_VECFUNC("powf", "_ZGVsMxvv_powf", SCALABLE(4))
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVsMxvv_pow", SCALABLE(2))
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVsMxvv_powf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("sin", "_ZGVsMxv_sin", SCALABLE(2))
+TLI_DEFINE_VECFUNC("sinf", "_ZGVsMxv_sinf", SCALABLE(4))
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVsMxv_sin", SCALABLE(2))
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVsMxv_sinf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh", SCALABLE(2))
+TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("sqrt", "_ZGVsMxv_sqrt", SCALABLE(2))
+TLI_DEFINE_VECFUNC("sqrtf", "_ZGVsMxv_sqrtf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("tan", "_ZGVsMxv_tan", SCALABLE(2))
+TLI_DEFINE_VECFUNC("tanf", "_ZGVsMxv_tanf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("tanh", "_ZGVsMxv_tanh", SCALABLE(2))
+TLI_DEFINE_VECFUNC("tanhf", "_ZGVsMxv_tanhf", SCALABLE(4))
+
+TLI_DEFINE_VECFUNC("tgamma", "_ZGVsMxv_tgamma", SCALABLE(2))
+TLI_DEFINE_VECFUNC("tgammaf", "_ZGVsMxv_tgammaf", SCALABLE(4))
+
 #else
 #error "Must choose which vector library functions are to be defined."
 #endif
@@ -616,4 +692,5 @@
 #undef TLI_DEFINE_SVML_VECFUNCS
 #undef TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS
 #undef TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
+#undef TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
 #undef TLI_DEFINE_MASSV_VECFUNCS_NAMES
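Because SCALABLE(NL) now expands to `ElementCount::getScalable(NL), true`, every scalable SLEEF entry in this file is registered as a masked mapping, matching the predicated `_ZGVsMx...` names. For illustration only (this code is not itself in the patch), one entry expands under the `{SCAL, VEC, VF}` definition of TLI_DEFINE_VECFUNC to:

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// TLI_DEFINE_VECFUNC("cosf", "_ZGVsMxv_cosf", SCALABLE(4)) becomes:
const VecDesc CosfDesc = {
    "cosf",                       // ScalarFnName
    "_ZGVsMxv_cosf",              // VectorFnName
    ElementCount::getScalable(4), // VectorizationFactor: vscale x 4 lanes
    true,                         // Masked: SCALABLE() implies a masked ABI
};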
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -192,7 +192,7 @@
 /// where:
 ///
 /// <isa> = "_LLVM_"
-/// <mask> = "N". Note: TLI does not support masked interfaces.
+/// <mask> = "M" if masked, "N" if no mask.
 /// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor`
 ///          field of the `VecDesc` struct. If the number of lanes is scalable
 ///          then 'x' is printed instead.
@@ -200,7 +200,8 @@
 /// <scalarname> = the name of the scalar function.
 /// <vectorname> = the name of the vector function.
 std::string mangleTLIVectorName(StringRef VectorName, StringRef ScalarName,
-                                unsigned numArgs, ElementCount VF);
+                                unsigned numArgs, ElementCount VF,
+                                bool Masked = false);

 /// Retrieve the `VFParamKind` from a string token.
 VFParamKind getVFParamKindFromString(const StringRef Token);
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -1185,6 +1185,10 @@
     };
     const VecDesc VecFuncs_VF4[] = {
 #define TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
+#include "llvm/Analysis/VecFuncs.def"
+    };
+    const VecDesc VecFuncs_VFScalable[] = {
+#define TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
 #include "llvm/Analysis/VecFuncs.def"
     };

@@ -1195,6 +1199,7 @@
     case llvm::Triple::aarch64_be:
       addVectorizableFunctions(VecFuncs_VF2);
       addVectorizableFunctions(VecFuncs_VF4);
+      addVectorizableFunctions(VecFuncs_VFScalable);
       break;
     }
     break;
@@ -1214,16 +1219,16 @@
   return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName;
 }

-StringRef
-TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
-                                             const ElementCount &VF) const {
+StringRef TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
+                                                       const ElementCount &VF,
+                                                       bool Masked) const {
   F = sanitizeFunctionName(F);
   if (F.empty())
     return F;
   std::vector<VecDesc>::const_iterator I =
       llvm::lower_bound(VectorDescs, F, compareWithScalarFnName);
   while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) {
-    if (I->VectorizationFactor == VF)
+    if ((I->VectorizationFactor == VF) && (I->Masked == Masked))
       return I->VectorFnName;
     ++I;
   }
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1529,10 +1529,10 @@
 std::string VFABI::mangleTLIVectorName(StringRef VectorName,
                                        StringRef ScalarName, unsigned numArgs,
-                                       ElementCount VF) {
+                                       ElementCount VF, bool Masked) {
   SmallString<256> Buffer;
   llvm::raw_svector_ostream Out(Buffer);
-  Out << "_ZGV" << VFABI::_LLVM_ << "N";
+  Out << "_ZGV" << VFABI::_LLVM_ << (Masked ? "M" : "N");
   if (VF.isScalable())
     Out << 'x';
   else
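With the Masked flag threaded through, mangleTLIVectorName emits "M" for the <mask> token and, as before, "x" in place of the lane count for a scalable <vlen>. A small sketch of the resulting name (the wrapper function is illustrative, not from the patch):

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Mangle the TLI mapping for a masked, scalable, one-operand "sin".
std::string demoMangledName() {
  return VFABI::mangleTLIVectorName("_ZGVsMxv_sin", "sin", /*numArgs=*/1,
                                    ElementCount::getScalable(2),
                                    /*Masked=*/true);
  // Returns "_ZGV_LLVM_Mxv_sin(_ZGVsMxv_sin)".
}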
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -156,7 +156,7 @@
   // and the exact vector width of the call operands in the
   // TargetLibraryInfo.
   const std::string TLIName =
-      std::string(TLI.getVectorizedFunction(ScalarName, VF));
+      std::string(TLI.getVectorizedFunction(ScalarName, VF, false));

   LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
                     << ScalarName << "` and vector width " << VF << ".\n");
diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
--- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -40,6 +40,7 @@
 /// CI (other than void) need to be widened to a VectorType of VF
 /// lanes.
 static void addVariantDeclaration(CallInst &CI, const ElementCount &VF,
+                                  bool Predicate,
                                   const StringRef VFName) {
   Module *M = CI.getModule();
@@ -50,6 +51,8 @@
     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
   assert(!CI.getFunctionType()->isVarArg() &&
          "VarArg functions are not supported.");
+  if (Predicate)
+    Tys.push_back(ToVectorTy(Type::getInt1Ty(RetTy->getContext()), VF));
   FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
   Function *VectorF =
       Function::Create(FTy, Function::ExternalLinkage, VFName, M);
@@ -89,19 +92,19 @@
   const SetVector<StringRef> OriginalSetOfMappings(Mappings.begin(),
                                                    Mappings.end());

-  auto AddVariantDecl = [&](const ElementCount &VF) {
+  auto AddVariantDecl = [&](const ElementCount &VF, bool Predicate) {
     const std::string TLIName =
-        std::string(TLI.getVectorizedFunction(ScalarName, VF));
+        std::string(TLI.getVectorizedFunction(ScalarName, VF, Predicate));
     if (!TLIName.empty()) {
-      std::string MangledName =
-          VFABI::mangleTLIVectorName(TLIName, ScalarName, CI.arg_size(), VF);
+      std::string MangledName = VFABI::mangleTLIVectorName(
+          TLIName, ScalarName, CI.arg_size(), VF, Predicate);
       if (!OriginalSetOfMappings.count(MangledName)) {
         Mappings.push_back(MangledName);
         ++NumCallInjected;
       }
       Function *VariantF = M->getFunction(TLIName);
       if (!VariantF)
-        addVariantDeclaration(CI, VF, TLIName);
+        addVariantDeclaration(CI, VF, Predicate, TLIName);
     }
   };

@@ -109,13 +112,15 @@
   ElementCount WidestFixedVF, WidestScalableVF;
   TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);

-  for (ElementCount VF = ElementCount::getFixed(2);
-       ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
-    AddVariantDecl(VF);
+  for (bool Predicated : { false, true }) {
+    for (ElementCount VF = ElementCount::getFixed(2);
+         ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
+      AddVariantDecl(VF, Predicated);

-  // TODO: Add scalable variants once we're able to test them.
-  assert(WidestScalableVF.isZero() &&
-         "Scalable vector mappings not yet supported");
+    for (ElementCount VF = ElementCount::getScalable(2);
+         ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2)
+      AddVariantDecl(VF, Predicated);
+  }

   VFABI::setVectorVariantNames(&CI, Mappings);
 }
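For a predicated variant, addVariantDeclaration now appends an i1 mask vector as the last parameter. A self-contained sketch (assumed for illustration, not code from the patch) of the declaration it would create for a double(double) function at VF = vscale x 2:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Builds: declare <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double>,
//                                                     <vscale x 2 x i1>)
static Function *declareMaskedVariant(Module &M) {
  LLVMContext &Ctx = M.getContext();
  ElementCount VF = ElementCount::getScalable(2);
  Type *VecTy = VectorType::get(Type::getDoubleTy(Ctx), VF);
  Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx), VF); // trailing mask
  FunctionType *FTy =
      FunctionType::get(VecTy, {VecTy, MaskTy}, /*isVarArg=*/false);
  return Function::Create(FTy, Function::ExternalLinkage, "_ZGVsMxv_sin", &M);
}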
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
@@ -1,5 +1,6 @@
 ; Do NOT use -O3. It will lower exp2 to ldexp, and the test will fail.
-; RUN: opt -vector-library=sleefgnuabi -replace-with-veclib < %s | opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-unroll,loop-vectorize -S | FileCheck %s
+; RUN: opt -vector-library=sleefgnuabi -replace-with-veclib < %s | opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-unroll,loop-vectorize -S | FileCheck %s --check-prefixes=CHECK,NEON
+; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -replace-with-veclib < %s | opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-unroll,loop-vectorize -S | FileCheck %s --check-prefixes=CHECK,SVE

 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
@@ -11,7 +12,8 @@

 define void @acos_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @acos_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -34,7 +36,8 @@

 define void @acos_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @acos_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -62,7 +65,8 @@

 define void @asin_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @asin_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -85,7 +89,8 @@

 define void @asin_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @asin_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -113,7 +118,8 @@

 define void @atan_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @atan_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -136,7 +142,8 @@

 define void @atan_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @atan_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -164,7 +171,8 @@

 define void @atan2_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @atan2_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -187,7 +195,8 @@

 define void @atan2_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @atan2_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -215,7 +224,8 @@

 define void @atanh_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @atanh_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atanh(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -238,7 +248,8 @@

 define void @atanh_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @atanh_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanhf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -266,7 +277,8 @@

 define void @cos_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @cos_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -289,7 +301,8 @@

 define void @cos_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @cos_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -317,7 +330,8 @@

 define void @cosh_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @cosh_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -340,7 +354,8 @@

 define void @cosh_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @cosh_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -368,7 +383,8 @@

 define void @exp_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @exp_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -391,7 +407,8 @@

 define void @exp_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @exp_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -419,7 +436,8 @@

 define void @exp2_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @exp2_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -442,7 +460,8 @@

 define void @exp2_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @exp2_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -470,7 +489,8 @@

 define void @exp10_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @exp10_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -493,7 +513,8 @@

 define void @exp10_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @exp10_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -521,7 +542,8 @@

 define void @lgamma_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @lgamma_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_lgamma(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -544,7 +566,8 @@

 define void @lgamma_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @lgamma_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_lgammaf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -572,7 +595,8 @@

 define void @log10_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @log10_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -595,7 +619,8 @@

 define void @log10_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @log10_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -674,7 +699,8 @@

 define void @log_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @log_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -697,7 +723,8 @@

 define void @log_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @log_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -725,7 +752,8 @@

 define void @pow_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @pow_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -748,7 +776,8 @@

 define void @pow_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @pow_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -776,7 +805,8 @@

 define void @sin_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @sin_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -799,7 +829,8 @@

 define void @sin_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @sin_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -827,7 +858,8 @@

 define void @sinh_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @sinh_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -850,7 +882,8 @@

 define void @sinh_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @sinh_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -878,7 +911,8 @@

 define void @sqrt_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @sqrt_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sqrt(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -901,7 +935,8 @@

 define void @sqrt_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @sqrt_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sqrtf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -922,10 +957,10 @@
   ret void
 }

-
 define void @llvm_sqrt_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @llvm_sqrt_f64(
-  ; CHECK: [[TMP5:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call fast <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[TMP4:%.*]])
   ; CHECK: ret void
   ;
 entry:
@@ -948,7 +983,8 @@

 define void @llvm_sqrt_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @llvm_sqrt_f32(
-  ; CHECK: [[TMP5:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call fast <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[TMP4:%.*]])
   ; CHECK: ret void
   ;
 entry:
@@ -976,7 +1012,8 @@

 define void @tan_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @tan_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -999,7 +1036,8 @@

 define void @tan_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @tan_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -1027,7 +1065,8 @@

 define void @tanh_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @tanh_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -1050,7 +1089,8 @@

 define void @tanh_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @tanh_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -1078,7 +1118,8 @@

 define void @tgamma_f64(double* nocapture %varray) {
   ; CHECK-LABEL: @tgamma_f64(
-  ; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tgamma(<vscale x 2 x double> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry:
@@ -1101,7 +1142,8 @@

 define void @tgamma_f32(float* nocapture %varray) {
   ; CHECK-LABEL: @tgamma_f32(
-  ; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[TMP4:%.*]])
+  ; NEON: [[TMP5:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[TMP4:%.*]])
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tgammaf(<vscale x 4 x float> [[TMP4:%.*]], {{.*}})
   ; CHECK: ret void
   ;
 entry: