diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -85,6 +85,7 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
@@ -3227,6 +3228,39 @@
   });
 }
 
+std::pair<int, int> getVectorCallCosts(CallInst *CI,
+                                       VectorType *VecTy,
+                                       TargetTransformInfo *TTI,
+                                       TargetLibraryInfo *TLI) {
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+  // Calculate the cost of the scalar and vector calls.
+  FastMathFlags FMF;
+  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+    FMF = FPMO->getFastMathFlags();
+
+  SmallVector<Value *, 4> Args(CI->arg_operands());
+  int IntrinsicCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+                                                 VecTy->getNumElements());
+
+  auto Shape =
+      VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
+                   false /*HasGlobalPred*/);
+  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+  int LibCost = IntrinsicCost;
+  if (!CI->isNoBuiltin() && VecFunc) {
+    // Calculate the cost of the vector library call.
+    SmallVector<Type *, 4> VecTys;
+    for (Use &Arg : CI->args())
+      VecTys.push_back(
+          VectorType::get(Arg->getType(), VecTy->getNumElements()));
+
+    // If the corresponding vector call is cheaper, return its cost.
+    LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys);
+  }
+  return {IntrinsicCost, LibCost};
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
   ArrayRef<Value *> VL = E->Scalars;
 
@@ -3539,9 +3573,8 @@
       }
       int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
 
-      SmallVector<Value *, 4> Args(CI->arg_operands());
-      int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
-                                                   VecTy->getNumElements());
+      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+      int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
 
       LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
                         << " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -4446,13 +4479,18 @@
       if (Function *FI = CI->getCalledFunction())
         IID = FI->getIntrinsicID();
 
+      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+      bool UseIntrinsic = VecCallCosts.first <= VecCallCosts.second;
+
       Value *ScalarArg = nullptr;
       std::vector<Value *> OpVecs;
       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
         ValueList OpVL;
         // Some intrinsics have scalar arguments. This argument should not be
         // vectorized.
-        if (hasVectorInstrinsicScalarOpd(IID, j)) {
+        if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
           CallInst *CEI = cast<CallInst>(VL0);
           ScalarArg = CEI->getArgOperand(j);
           OpVecs.push_back(CEI->getArgOperand(j));
@@ -4465,9 +4503,16 @@
       }
 
       Module *M = F->getParent();
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+
+      if (!UseIntrinsic) {
+        VFShape Shape = VFShape::get(
+            *CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
+            false /*HasGlobalPred*/);
+        CF = VFDatabase(*CI).getVectorizedFunction(Shape);
+      }
+
       SmallVector<OperandBundleDef, 1> OpBundles;
       CI->getOperandBundlesAsDefs(OpBundles);
       Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
@@ -5561,6 +5606,7 @@
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<DemandedBitsWrapperPass>();
     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+    AU.addRequired<InjectTLIMappingsLegacy>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
@@ -7476,6 +7522,7 @@
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
 
 Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@@ -250,6 +250,7 @@
 ; CHECK-NEXT: Lazy Branch Probability Analysis
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Optimization Remark Emitter
+; CHECK-NEXT: Inject TLI Mappings
 ; CHECK-NEXT: SLP Vectorizer
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: Combine redundant instructions
diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@@ -255,6 +255,7 @@
 ; CHECK-NEXT: Lazy Branch Probability Analysis
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Optimization Remark Emitter
+; CHECK-NEXT: Inject TLI Mappings
 ; CHECK-NEXT: SLP Vectorizer
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: Combine redundant instructions
diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@@ -237,6 +237,7 @@
 ; CHECK-NEXT: Lazy Branch Probability Analysis
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Optimization Remark Emitter
+; CHECK-NEXT: Inject TLI Mappings
 ; CHECK-NEXT: SLP Vectorizer
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: Combine redundant instructions
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
@@ -8,23 +8,20 @@
 
 declare float @llvm.sin.f32(float) #1
 
-; FIXME: Accelerate provides sin() for <4 x float>
+; Accelerate provides sin() for <4 x float>
 define <4 x float> @sin_4x(<4 x float>* %a) {
 ; CHECK-LABEL: @sin_4x(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
 ; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
 ;
 ; NOACCELERATE-LABEL: @sin_4x(
@@ -67,10 +64,10 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
+; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #1
 ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #1
 ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
 ;
@@ -99,23 +96,20 @@
 
 declare float @llvm.cos.f32(float) #1
 
-; FIXME: Accelerate provides cos() for <4 x float>
+; Accelerate provides cos() for <4 x float>
 define <4 x float> @cos_4x(<4 x float>* %a) {
 ; CHECK-LABEL: @cos_4x(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
-; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
 ; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
 ;
 ; NOACCELERATE-LABEL: @cos_4x(
@@ -158,10 +152,10 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
+; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #2
 ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #2
 ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
 ;
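
For reference, the core mechanism this patch relies on is the VFDatabase lookup that maps a scalar call to a vector library routine, with getVectorCallCosts then picking the cheaper of the vector intrinsic and the library call. A minimal C++ sketch of that lookup, mirroring the calls made in the patch above and assuming a scalar CallInst *CI and an unsigned lane count VF are in scope (illustrative only, not part of the patch):

  // Describe the hypothetical vector call: VF lanes, fixed width, unpredicated.
  VFShape Shape = VFShape::get(*CI, /*EC=*/{VF, /*Scalable=*/false},
                               /*HasGlobalPred=*/false);
  // VFDatabase reads the "vector-function-abi-variant" attributes that the
  // InjectTLIMappings pass attaches based on TargetLibraryInfo (e.g. with
  // -vector-library=Accelerate); it returns null if no variant matches.
  if (Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape)) {
    // A vector library call is available (e.g. @vsinf for a 4-lane
    // @llvm.sin.f32); the SLP vectorizer emits a call to it whenever
    // getVectorCallCosts reports it as the cheaper option.
  }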