diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -644,6 +644,9 @@
 TLI_DEFINE_VECFUNC("exp10", "_ZGVsMxv_exp10", SCALABLE(2), MASKED)
 TLI_DEFINE_VECFUNC("exp10f", "_ZGVsMxv_exp10f", SCALABLE(4), MASKED)
 
+TLI_DEFINE_VECFUNC("fmod", "_ZGVsMxvv_fmod", SCALABLE(2), MASKED)
+TLI_DEFINE_VECFUNC("fmodf", "_ZGVsMxvv_fmodf", SCALABLE(4), MASKED)
+
 TLI_DEFINE_VECFUNC("lgamma", "_ZGVsMxv_lgamma", SCALABLE(2), MASKED)
 TLI_DEFINE_VECFUNC("lgammaf", "_ZGVsMxv_lgammaf", SCALABLE(4), MASKED)
 
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -89,6 +89,62 @@
   return true;
 }
 
+static bool replaceInstructionWithTLIFunction(Instruction &I,
+                                              const StringRef TLIName,
+                                              bool Masked,
+                                              ElementCount NumElements,
+                                              Type *ElementType) {
+  Module *M = I.getModule();
+  IRBuilder<> IRBuilder(&I);
+
+  // Check if the vector library function is already declared in this module,
+  // otherwise insert it.
+  Function *TLIFunc = M->getFunction(TLIName);
+  if (!TLIFunc) {
+    FunctionType *FTy = nullptr;
+    Type *RetTy = I.getType();
+    if (Masked) {
+      Type *Tys[3] = {RetTy, RetTy,
+                      ToVectorTy(IRBuilder.getInt1Ty(), NumElements)};
+      FTy = FunctionType::get(RetTy, Tys, false);
+    } else {
+      Type *Tys[2] = {RetTy, RetTy};
+      FTy = FunctionType::get(RetTy, Tys, false);
+    }
+    TLIFunc = Function::Create(FTy, Function::ExternalLinkage, TLIName, *M);
+
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `"
+                      << TLIName << "` of type `" << *(TLIFunc->getType())
+                      << "` to module.\n");
+
+    ++NumTLIFuncDeclAdded;
+
+    // Add the freshly created function to llvm.compiler.used,
+    // similar to as it is done in InjectTLIMappings
+    appendToCompilerUsed(*M, {TLIFunc});
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName
+                      << "` to `@llvm.compiler.used`.\n");
+    ++NumFuncUsedAdded;
+  }
+  SmallVector<Value *> Args(I.operand_values());
+  if (Masked) {
+    Value *AllActiveMask = ConstantInt::getTrue(VectorType::get(
+        IntegerType::getInt1Ty(TLIFunc->getType()->getContext()), NumElements));
+    Args.push_back(AllActiveMask);
+  }
+  CallInst *Replacement = IRBuilder.CreateCall(TLIFunc, Args);
+  I.replaceAllUsesWith(Replacement);
+  if (isa<FPMathOperator>(Replacement)) {
+    // Preserve fast math flags for FP math.
+    Replacement->copyFastMathFlags(&I);
+  }
+  Replacement->copyMetadata(I);
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" << I.getOpcodeName()
+                    << "` with call to `" << TLIName << "`.\n");
+  ++NumCallsReplaced;
+  return true;
+}
+
 static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
                                     CallInst &CI) {
   if (!CI.getCalledFunction()) {
@@ -173,21 +229,72 @@
   return false;
 }
 
+static bool replaceInstructionWithCallToVeclib(const TargetLibraryInfo &TLI,
+                                               Instruction &I) {
+  // We only have TLI mappings for SVE.
+  if (!I.getType()->isScalableTy()) {
+    return false;
+  }
+  auto *VectorArgTy = dyn_cast<VectorType>(I.getType());
+  if (!VectorArgTy) {
+    return false;
+  }
+  ElementCount NumElements = VectorArgTy->getElementCount();
+  Type *ElementType = VectorArgTy->getElementType();
+  StringRef ScalarName =
+      (ElementType->isFloatTy())
+          ? TLI.getName(LibFunc_fmodf)
+          : ((ElementType->isDoubleTy()) ? TLI.getName(LibFunc_fmod) : "");
+  if (!ScalarName.empty()) {
+    if (!TLI.isFunctionVectorizable(ScalarName)) {
+      // The TargetLibraryInfo does not contain a vectorized version of
+      // the scalar function.
+      return false;
+    }
+    const std::string TLINameUnmasked =
+        std::string(TLI.getVectorizedFunction(ScalarName, NumElements));
+    const std::string TLINameMasked =
+        std::string(TLI.getVectorizedFunction(ScalarName, NumElements, true));
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
+                      << ScalarName << "` and vector width " << NumElements
+                      << ".\n");
+    if (!TLINameUnmasked.empty()) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found unmasked TLI function `"
+                        << TLINameUnmasked << "`.\n");
+      return replaceInstructionWithTLIFunction(I, TLINameUnmasked, false,
+                                               NumElements, ElementType);
+    } else if (!TLINameMasked.empty()) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found masked TLI function `"
+                        << TLINameMasked << "`.\n");
+      return replaceInstructionWithTLIFunction(I, TLINameMasked, true,
+                                               NumElements, ElementType);
+    }
+  }
+  return false;
+}
+
 static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
   bool Changed = false;
-  SmallVector<CallInst *> ReplacedCalls;
+  SmallVector<Instruction *> ReplacedCalls;
   for (auto &I : instructions(F)) {
     if (auto *CI = dyn_cast<CallInst>(&I)) {
       if (replaceWithCallToVeclib(TLI, *CI)) {
-        ReplacedCalls.push_back(CI);
+        ReplacedCalls.push_back(&I);
+        Changed = true;
+      }
+    } else if (I.getOpcode() == Instruction::FRem) {
+      // If there is a suitable TLI mapping for FRem instruction,
+      // replace the instruction.
+      if (replaceInstructionWithCallToVeclib(TLI, I)) {
+        ReplacedCalls.push_back(&I);
         Changed = true;
       }
     }
   }
-  // Erase the calls to the intrinsics that have been replaced
-  // with calls to the vector library.
-  for (auto *CI : ReplacedCalls) {
-    CI->eraseFromParent();
+  // Erase the calls to the intrinsics and the instructions that have been
+  // replaced with calls to the vector library.
+  for (auto *I : ReplacedCalls) {
+    I->eraseFromParent();
   }
   return Changed;
 }
diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
--- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
+++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
@@ -377,4 +377,26 @@
   ret <vscale x 4 x float> %1
 }
 
+; NOTE: TLI mappings for FREM instruction.
+
+define <vscale x 2 x double> @frem_vscale_f64(<vscale x 2 x double> %in) #0 {
+; CHECK-LABEL: define <vscale x 2 x double> @frem_vscale_f64
+; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @armpl_svfmod_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 7.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %out = frem <vscale x 2 x double> %in, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 7.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  ret <vscale x 2 x double> %out
+}
+
+define <vscale x 4 x float> @frem_vscale_f32(<vscale x 4 x float> %in) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @frem_vscale_f32
+; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @armpl_svfmod_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 7.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %out = frem <vscale x 4 x float> %in, shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 7.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x float> %out
+}
+
 attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
--- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
@@ -365,6 +365,26 @@
   ret <vscale x 4 x float> %1
 }
 
+; NOTE: TLI mapping for FREM instruction.
+
+define <vscale x 2 x double> @frem_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @frem_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_fmod(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 7.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %out = frem <vscale x 2 x double> %in, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 7.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+  ret <vscale x 2 x double> %out
+}
+
+define <vscale x 4 x float> @frem_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @frem_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_fmodf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 7.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %out = frem <vscale x 4 x float> %in, shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 7.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+  ret <vscale x 4 x float> %out
+}
+
 declare <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double>)
 declare <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
@@ -1,6 +1,6 @@
 ; Do NOT use -O3. It will lower exp2 to ldexp, and the test will fail.
-; RUN: opt -vector-library=sleefgnuabi -replace-with-veclib < %s | opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-unroll,loop-vectorize -S | FileCheck %s --check-prefixes=CHECK,NEON
-; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -replace-with-veclib < %s | opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-unroll,loop-vectorize -S | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-unroll,loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,NEON
+; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-unroll,loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,SVE
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -535,6 +535,55 @@
   ret void
 }
 
+declare double @fmod(double, double) #0
+declare float @fmodf(float, float) #0
+
+define void @fmod_f64(double* nocapture %varray) {
+  ; CHECK-LABEL: @fmod_f64(
+  ; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_fmod(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], {{.*}})
+  ; CHECK: ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @fmod(double %conv, double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
+define void @fmod_f32(float* nocapture %varray) {
+  ; CHECK-LABEL: @fmod_f32(
+  ; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_fmodf(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], {{.*}})
+  ; CHECK: ret void
+  ;
+  entry:
+  br label %for.body
+
+  for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @fmodf(float %conv, float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+  for.end:
+  ret void
+}
+
 declare double @lgamma(double) #0
 declare float @lgammaf(float) #0
 declare double @llvm.lgamma.f64(double) #0