Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -380,6 +380,7 @@ void initializeRegionViewerPass(PassRegistry&); void initializeRegisterCoalescerPass(PassRegistry&); void initializeRenameIndependentSubregsPass(PassRegistry&); +void initializeReplaceWithVeclibLegacyPass(PassRegistry &); void initializeResetMachineFunctionPass(PassRegistry&); void initializeReversePostOrderFunctionAttrsLegacyPassPass(PassRegistry&); void initializeRewriteStatepointsForGCLegacyPassPass(PassRegistry &); Index: llvm/include/llvm/LinkAllPasses.h =================================================================== --- llvm/include/llvm/LinkAllPasses.h +++ llvm/include/llvm/LinkAllPasses.h @@ -228,6 +228,7 @@ (void) llvm::createWarnMissedTransformationsPass(); (void) llvm::createHardwareLoopsPass(); (void) llvm::createInjectTLIMappingsLegacyPass(); + (void)llvm::createReplaceWithVeclibLegacyPass(); (void) llvm::createUnifyLoopExitsPass(); (void) llvm::createFixIrreduciblePass(); Index: llvm/include/llvm/Transforms/Utils.h =================================================================== --- llvm/include/llvm/Transforms/Utils.h +++ llvm/include/llvm/Transforms/Utils.h @@ -133,6 +133,13 @@ // FunctionPass *createInjectTLIMappingsLegacyPass(); +//===----------------------------------------------------------------------===// +// +// ReplaceWithVeclibLegacy - replaces calls to builtins and intrinsics +// operating on vectors with calls to functions from the TargetLibraryInfo. 
+// +FunctionPass *createReplaceWithVeclibLegacyPass(); + //===----------------------------------------------------------------------===// // // UnifyLoopExits - For each loop, creates a new block N such that all exiting Index: llvm/include/llvm/Transforms/Utils/ReplaceWithVeclib.h =================================================================== --- /dev/null +++ llvm/include/llvm/Transforms/Utils/ReplaceWithVeclib.h @@ -0,0 +1,38 @@ +//===- ReplaceWithVeclib.h - Replace vector intrinsics with veclib calls --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Replaces calls to LLVM vector intrinsics (i.e., calls to LLVM intrinsics +// with vector operands) with matching calls to functions from a vector +// library (e.g., libmvec, SVML) according to TargetLibraryInfo. 
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_UTILS_REPLACEWITHVECLIB_H +#define LLVM_TRANSFORMS_UTILS_REPLACEWITHVECLIB_H + +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" + +namespace llvm { +class ReplaceWithVeclib : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +// Legacy pass +class ReplaceWithVeclibLegacy : public FunctionPass { +public: + static char ID; + ReplaceWithVeclibLegacy() : FunctionPass(ID) { + initializeReplaceWithVeclibLegacyPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; +}; + +} // End namespace llvm +#endif // LLVM_TRANSFORMS_UTILS_REPLACEWITHVECLIB_H \ No newline at end of file Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -226,6 +226,7 @@ #include "llvm/Transforms/Utils/Mem2Reg.h" #include "llvm/Transforms/Utils/MetaRenamer.h" #include "llvm/Transforms/Utils/NameAnonGlobals.h" +#include "llvm/Transforms/Utils/ReplaceWithVeclib.h" #include "llvm/Transforms/Utils/StripGCRelocates.h" #include "llvm/Transforms/Utils/StripNonLineTableDebugInfo.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -298,6 +298,7 @@ FUNCTION_PASS("reassociate", ReassociatePass()) FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass()) FUNCTION_PASS("reg2mem", RegToMemPass()) +FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib()) FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass()) FUNCTION_PASS("scalarizer", ScalarizerPass()) 
FUNCTION_PASS("separate-const-offset-from-gep", SeparateConstOffsetFromGEPPass()) Index: llvm/lib/Transforms/Utils/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/Utils/CMakeLists.txt +++ llvm/lib/Transforms/Utils/CMakeLists.txt @@ -53,6 +53,7 @@ NameAnonGlobals.cpp PredicateInfo.cpp PromoteMemoryToRegister.cpp + ReplaceWithVeclib.cpp ScalarEvolutionExpander.cpp StripGCRelocates.cpp SSAUpdater.cpp Index: llvm/lib/Transforms/Utils/ReplaceWithVeclib.cpp =================================================================== --- /dev/null +++ llvm/lib/Transforms/Utils/ReplaceWithVeclib.cpp @@ -0,0 +1,243 @@ +//===- ReplaceWithVeclib.cpp - Replace vector intrinsics with veclib calls ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Replaces calls to LLVM vector intrinsics (i.e., calls to LLVM intrinsics +// with vector operands) with matching calls to functions from a vector +// library (e.g., libmvec, SVML) according to TargetLibraryInfo. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/ReplaceWithVeclib.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "replace-with-veclib" + +STATISTIC(NumCallsReplaced, + "Number of calls to intrinsics that have been replaced."); + +STATISTIC(NumTLIFuncDeclAdded, + "Number of vector library function declarations added."); + +STATISTIC(NumFuncUsedAdded, + "Number of functions added to `llvm.compiler.used`"); + +static bool replaceWithTLIFunction(CallInst &CI, const StringRef TLIName) { + Module *M = CI.getModule(); + + Function *OldFunc = CI.getCalledFunction(); + + // Check if the vector library function is already declared in this module, + // otherwise insert it. 
+ Function *TLIFunc = M->getFunction(TLIName); + if (!TLIFunc) { + TLIFunc = Function::Create(OldFunc->getFunctionType(), + Function::ExternalLinkage, TLIName, *M); + TLIFunc->copyAttributesFrom(OldFunc); + + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `" + << TLIName << "` of type `" << *(TLIFunc->getType()) + << "` to module.\n"); + + ++NumTLIFuncDeclAdded; + + // Add the freshly created function to llvm.compiler.used, + // similar to as it is done in InjectTLIMappings + appendToCompilerUsed(*M, {TLIFunc}); + + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName + << "` to `@llvm.compiler.used`.\n"); + ++NumFuncUsedAdded; + } + + // Replace the call to the vector intrinsic with a call + // to the corresponding function from the vector library. + IRBuilder<> builder{&CI}; + SmallVector args(CI.arg_operands()); + // Preserve the operand bundles. + SmallVector OpBundles; + CI.getOperandBundlesAsDefs(OpBundles); + CallInst *Replacement = builder.CreateCall(TLIFunc, args, OpBundles); + CI.replaceAllUsesWith(Replacement); + if (isa(Replacement)) { + // Preserve fast math flags for FP math. + Replacement->copyFastMathFlags(&CI); + } + + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" + << OldFunc->getName() << "` with call to `" << TLIName + << "`.\n"); + ++NumCallsReplaced; + return true; +} + +static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, + CallInst &CI) { + if (!CI.getCalledFunction()) { + return false; + } + + auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID(); + if (IntrinsicID == Intrinsic::not_intrinsic) { + // Replacement is only performed for intrinsic functions + return false; + } + + // Convert vector arguments to scalar type and check that + // all vector operands have identical vector width. 
+ unsigned VF = 0; + SmallVector ScalarTypes; + for (auto arg : enumerate(CI.arg_operands())) { + auto ArgType = arg.value()->getType(); + // Vector calls to intrinsics can still have + // scalar operands for specific arguments. + if (hasVectorInstrinsicScalarOpd(IntrinsicID, arg.index())) { + ScalarTypes.push_back(ArgType); + } else { + // The argument in this place should be a vector if + // this is a call to a vector intrinsic. + auto VectorArgTy = dyn_cast(ArgType); + if (!VectorArgTy) { + // The argument is not a vector, do not perform + // the replacement. + return false; + } + auto NumElements = VectorArgTy->getElementCount(); + if (NumElements.isScalable()) { + // The current implementation does not support + // scalable vectors. + return false; + } + if (VF && VF != NumElements.getFixedValue()) { + // The different arguments differ in vector size. + return false; + } else { + VF = NumElements.getFixedValue(); + } + ScalarTypes.push_back(VectorArgTy->getElementType()); + } + } + + // Try to reconstruct the name for the scalar version of this + // intrinsic using the intrinsic ID and the argument types + // converted to scalar above. + std::string ScalarName; + if (Intrinsic::isOverloaded(IntrinsicID)) { + ScalarName = Intrinsic::getName(IntrinsicID, ScalarTypes); + } else { + ScalarName = Intrinsic::getName(IntrinsicID).str(); + } + + if (!TLI.isFunctionVectorizable(ScalarName)) { + // The TargetLibraryInfo does not contain a vectorized version of + // the scalar function. + return false; + } + + // Try to find the mapping for the scalar version of this intrinsic + // and the exact vector width of the call operands in the + // TargetLibraryInfo. 
+ const std::string TLIName = + std::string(TLI.getVectorizedFunction(ScalarName, VF)); + + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `" + << ScalarName << "` and vector width " << VF << ".\n"); + + if (!TLIName.empty()) { + // Found the correct mapping in the TargetLibraryInfo, + // replace the call to the intrinsic with a call to + // the vector library function. + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName + << "`.\n"); + return replaceWithTLIFunction(CI, TLIName); + } + + return false; +} + +static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { + bool changed = false; + for (auto &I : instructions(F)) { + if (auto CI = dyn_cast(&I)) { + changed |= replaceWithCallToVeclib(TLI, *CI); + } + } + return changed; +} + +//////////////////////////////////////////////////////////////////////////////// +// New pass manager implementation. +//////////////////////////////////////////////////////////////////////////////// +PreservedAnalyses ReplaceWithVeclib::run(Function &F, + FunctionAnalysisManager &AM) { + const TargetLibraryInfo &TLI = AM.getResult(F); + auto changed = runImpl(TLI, F); + if (changed) { + PreservedAnalyses PA; + PA.preserveSet(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + return PA; + } else { + // The pass did not replace any calls, hence it preserves all analyses. + return PreservedAnalyses::all(); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Legacy PM Implementation. 
+//////////////////////////////////////////////////////////////////////////////// +bool ReplaceWithVeclibLegacy::runOnFunction(Function &F) { + const TargetLibraryInfo &TLI = + getAnalysis().getTLI(F); + return runImpl(TLI, F); +} + +void ReplaceWithVeclibLegacy::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Legacy Pass manager initialization +//////////////////////////////////////////////////////////////////////////////// +char ReplaceWithVeclibLegacy::ID = 0; + +INITIALIZE_PASS_BEGIN(ReplaceWithVeclibLegacy, DEBUG_TYPE, + "Replace with calls to vector library", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ReplaceWithVeclibLegacy, DEBUG_TYPE, + "Replace with calls to vector library", false, false) + +FunctionPass *llvm::createReplaceWithVeclibLegacyPass() { + return new ReplaceWithVeclibLegacy(); +} \ No newline at end of file Index: llvm/lib/Transforms/Utils/Utils.cpp =================================================================== --- llvm/lib/Transforms/Utils/Utils.cpp +++ llvm/lib/Transforms/Utils/Utils.cpp @@ -43,6 +43,7 @@ initializeStripGCRelocatesLegacyPass(Registry); initializePredicateInfoPrinterLegacyPassPass(Registry); initializeInjectTLIMappingsLegacyPass(Registry); + initializeReplaceWithVeclibLegacyPass(Registry); initializeFixIrreduciblePass(Registry); initializeUnifyLoopExitsLegacyPassPass(Registry); initializeUniqueInternalLinkageNamesLegacyPassPass(Registry); Index: llvm/test/Transforms/Util/replace-intrinsics-with-veclib.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/Util/replace-intrinsics-with-veclib.ll @@ -0,0 +1,96 @@ +; RUN: opt -vector-library=SVML 
-replace-with-veclib -dce -S < %s | FileCheck %s --check-prefixes=COMMON,SVML,F64 +; RUN: opt -vector-library=SVML -passes=replace-with-veclib,dce -S < %s | FileCheck %s --check-prefixes=COMMON,SVML,F64 +; RUN: opt -vector-library=LIBMVEC-X86 -replace-with-veclib -dce -S < %s | FileCheck %s --check-prefixes=COMMON,LIBMVEC-X86,F64 +; RUN: opt -vector-library=LIBMVEC-X86 -passes=replace-with-veclib,dce -S < %s | FileCheck %s --check-prefixes=COMMON,LIBMVEC-X86,F64 +; RUN: opt -vector-library=MASSV -replace-with-veclib -dce -S < %s | FileCheck %s --check-prefixes=COMMON,MASSV,F32 +; RUN: opt -vector-library=MASSV -passes=replace-with-veclib,dce -S < %s | FileCheck %s --check-prefixes=COMMON,MASSV,F32 +; RUN: opt -vector-library=Accelerate -replace-with-veclib -dce -S < %s | FileCheck %s --check-prefixes=COMMON,ACCELERATE,F32 +; RUN: opt -vector-library=Accelerate -passes=replace-with-veclib,dce -S < %s | FileCheck %s --check-prefixes=COMMON,ACCELERATE,F32 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; COMMON-LABEL: @llvm.compiler.used = appending global +; F64-SAME: [2 x i8*] [ +; SVML-SAME: i8* bitcast (<4 x double> (<4 x double>)* @__svml_exp4 to i8*), +; SVML-SAME: i8* bitcast (<4 x float> (<4 x float>)* @__svml_expf4 to i8* +; LIBMVEC-X86-SAME: i8* bitcast (<4 x double> (<4 x double>)* @_ZGVdN4v_exp to i8*), +; LIBMVEC-X86-SAME: i8* bitcast (<4 x float> (<4 x float>)* @_ZGVbN4v_expf to i8*) +; F32-SAME: [1 x i8*] [ +; MASSV-SAME: i8* bitcast (<4 x float> (<4 x float>)* @__expf4_massv to i8*) +; ACCELERATE-SAME: i8* bitcast (<4 x float> (<4 x float>)* @vexpf to i8*) +; COMMON-SAME: ], section "llvm.metadata" + +define <4 x double> @exp_v4(<4 x double> %in) { +; COMMON-LABEL: @exp_v4( +; COMMON-SAME: <4 x double> %[[IN:[a-zA-Z0-9_]+]] +; LIBMVEC-X86: %[[CALL:[a-zA-Z0-9_]+]] = call <4 x double> @_ZGVdN4v_exp(<4 x double> %[[IN]]) +; SVML: %[[CALL:[a-zA-Z0-9_]+]] = call <4 x double> @__svml_exp4(<4 x 
double> %[[IN]]) +; F32: %[[CALL:[a-zA-Z0-9_]+]] = call <4 x double> @llvm.exp.v4f64(<4 x double> %[[IN]]) +; F64-NOT: call @llvm.exp.v4f64 +; COMMON: ret <4 x double> %[[CALL]] + %call = call <4 x double> @llvm.exp.v4f64(<4 x double> %in) + ret <4 x double> %call +} + +declare <4 x double> @llvm.exp.v4f64(<4 x double>) #0 + +define <4 x float> @exp_f32(<4 x float> %in) { +; COMMON-LABEL: @exp_f32( +; COMMON-SAME: <4 x float> %[[IN1:[a-zA-Z0-9_]+]] +; LIBMVEC-X86: %[[#CALL1:]] = call <4 x float> @_ZGVbN4v_expf(<4 x float> %[[IN1]]) +; SVML: %[[#CALL1:]] = call <4 x float> @__svml_expf4(<4 x float> %[[IN1]]) +; MASSV: %[[#CALL1:]] = call <4 x float> @__expf4_massv(<4 x float> %[[IN1]]) +; ACCELERATE: %[[#CALL1:]] = call <4 x float> @vexpf(<4 x float> %[[IN1]]) +; COMMON-NOT: call @llvm.exp.v4f32 +; COMMON: ret <4 x float> %[[#CALL1]] + %call = call <4 x float> @llvm.exp.v4f32(<4 x float> %in) + ret <4 x float> %call +} + +declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0 + +define double @exp_f64(double %in) { +; No replacement should take place for non-vector intrinsic +; COMMON-LABEL: @exp_f64( +; COMMON: %[[CALL2:[a-zA-Z0-9_]+]] = call double @llvm.exp.f64 +; COMMON: ret double %[[CALL2]] + %call = call double @llvm.exp.f64(double %in) + ret double %call +} + +declare double @llvm.exp.f64(double) #0 + +define <4 x double> @powi_v4(<4 x double> %in){ +; Check that the pass works with scalar operands on +; vector intrinsics. No vector library has a substitute for powi +; COMMON-LABEL: @powi_v4( +; COMMON: %[[CALL3:[a-zA-Z0-9_]+]] = call <4 x double> @llvm.powi.v4f64 +; COMMON: ret <4 x double> %[[CALL3]] + %call = call <4 x double> @llvm.powi.v4f64(<4 x double> %in, i32 3) + ret <4 x double> %call +} + +declare <4 x double> @llvm.powi.v4f64(<4 x double>, i32) #0 + +define <3 x double> @exp_v3(<3 x double> %in) { +; Replacement should not take place if the vector length +; does not match exactly. 
+; COMMON-LABEL: @exp_v3( +; COMMON: %[[CALL4:[a-zA-Z0-9_]+]] = call <3 x double> @llvm.exp.v3f64 +; COMMON: ret <3 x double> %[[CALL4]] + %call = call <3 x double> @llvm.exp.v3f64(<3 x double> %in) + ret <3 x double> %call +} + +declare <3 x double> @llvm.exp.v3f64(<3 x double>) #0 + +; LIBMVEC-X86: declare <4 x double> @_ZGVdN4v_exp(<4 x double>) #0 +; LIBMVEC-X86: declare <4 x float> @_ZGVbN4v_expf(<4 x float>) #0 +; SVML: declare <4 x double> @__svml_exp4(<4 x double>) #0 +; SVML: declare <4 x float> @__svml_expf4(<4 x float>) #0 +; MASSV: declare <4 x float> @__expf4_massv(<4 x float>) #0 +; ACCELERATE: declare <4 x float> @vexpf(<4 x float>) #0 + +attributes #0 = {nounwind readnone} +