Index: include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -322,6 +322,10 @@
   // Returns true if the NoNaN attribute is set on the function.
   bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
 
+  // Returns true if the loop contains a call that may be vectorized
+  // with a vector version of the library call.
+  bool mayHaveVectorLibCall() const { return MayHaveVectorLibCall; }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -475,6 +479,10 @@
   /// While vectorizing these instructions we have to generate a
   /// call to the appropriate masked intrinsic
   SmallPtrSet<Instruction *, 8> MaskedOp;
+
+  // Does the loop contain a call that may be vectorized with a vector version
+  // of the library call.
+  bool MayHaveVectorLibCall = false;
 };
 
 } // namespace llvm
Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -658,29 +658,36 @@
       // * Have a mapping to an IR intrinsic.
      // * Have a vector version available.
       auto *CI = dyn_cast<CallInst>(&I);
-      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
-          !isa<DbgInfoIntrinsic>(CI) &&
-          !(CI->getCalledFunction() && TLI &&
-            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
-        ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
-                  << "call instruction cannot be vectorized");
-        LLVM_DEBUG(
-            dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
-        return false;
-      }
-
-      // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
-      // second argument is the same (i.e. loop invariant)
-      if (CI && hasVectorInstrinsicScalarOpd(
-                    getVectorIntrinsicIDForCall(CI, TLI), 1)) {
-        auto *SE = PSE.getSE();
-        if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
-          ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
-                    << "intrinsic instruction cannot be vectorized");
-          LLVM_DEBUG(dbgs()
-                     << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+      if (CI) {
+        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+        bool VectorAvail = CI->getCalledFunction() && TLI &&
+            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName());
+        if (!ID && !VectorAvail && !isa<DbgInfoIntrinsic>(CI)) {
+          ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
+                    << "call instruction cannot be vectorized");
+          LLVM_DEBUG(
+              dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
           return false;
         }
+
+        // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
+        // second argument is the same (i.e. loop invariant)
+        if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+          auto *SE = PSE.getSE();
+          if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
+            ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
+                      << "intrinsic instruction cannot be vectorized");
+            LLVM_DEBUG(dbgs()
+                       << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+            return false;
+          }
+        }
+
+        // If a vector library call is available, we can only say the vectorized
+        // loop "may" contain a call to it, as the decision depends on the
+        // selected vectorization factor.
+        if (VectorAvail)
+          MayHaveVectorLibCall = true;
       }
 
       // Check that the instruction return type is vectorizable.
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1494,6 +1494,10 @@
   /// should be used.
   bool useEmulatedMaskMemRefHack(Instruction *I);
 
+  // Returns true if the loop vectorized with a factor of \p VF would contain a
+  // call to a vector library function.
+  bool containsVectorLibCall(unsigned VF);
+
   /// Create an analysis remark that explains why vectorization failed
   ///
   /// \p RemarkName is the identifier for the remark. \return the remark object
@@ -5151,6 +5155,12 @@
   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
     return 1;
 
+  // Do not interleave if the vectorized loop will contain a call to a vector
+  // library function, as the function call will cause the registers for
+  // the parallel instances to be spilled.
+  if (Legal->mayHaveVectorLibCall() && containsVectorLibCall(VF))
+    return 1;
+
   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
   LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                     << " registers\n");
@@ -5595,6 +5605,24 @@
   return Discount;
 }
 
+bool LoopVectorizationCostModel::containsVectorLibCall(unsigned VF) {
+  // Given a vectorization factor VF, this function looks for calls that would
+  // be vectorized with a vector version of the library call.
+  for (BasicBlock *BB : TheLoop->blocks())
+    for (Instruction &I : BB->instructionsWithoutDebug())
+      if (auto *CI = dyn_cast<CallInst>(&I)) {
+        bool NeedToScalarize;
+        unsigned CallCost =
+            getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+        if (!NeedToScalarize &&
+            (!ID || getVectorIntrinsicCost(CI, VF, TTI, TLI) > CallCost))
+          return true;
+      }
+
+  return false;
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
Index: test/Transforms/LoopVectorize/X86/interleaving-veclib-call.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/interleaving-veclib-call.ll
+++ test/Transforms/LoopVectorize/X86/interleaving-veclib-call.ll
@@ -0,0 +1,101 @@
+; RUN: opt -S -mtriple=x86_64-unknown-linux -mcpu=btver2 -vector-library=SVML -loop-vectorize < %s | FileCheck %s
+
+; This test checks that when a call is vectorized with a vector library call,
+; the interleave count used is 1 (i.e. the call appears only once). As loops
+; with reductions are treated specially by the cost model, we also test this
+; case. Finally, a test is included that checks that no restriction of the
+; interleave count is applied when the call is not vectorized to a vector
+; library call.
+
+; CHECK-LABEL: sinf-test
+; CHECK: call <8 x float> @__svml_sinf8
+; CHECK-NOT: call <8 x float> @__svml_sinf8
+
+define void @sinf-test(float* nocapture readonly %a, float* nocapture %b, i32 %n) {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @sinf(float %0)
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare dso_local float @sinf(float) local_unnamed_addr
+
+; CHECK-LABEL: sinf-reduc-test
+; CHECK: call fast <8 x float> @__svml_sinf8
+; CHECK-NOT: call fast <8 x float> @__svml_sinf8
+
+define float @sinf-reduc-test(float* nocapture readonly %a, i32 %n) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  ret float %s.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.07 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %1 = tail call fast float @llvm.sin.f32(float %0)
+  %add = fadd fast float %1, %s.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.sin.f32(float)
+
+; CHECK-LABEL: ceilf-test
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+; CHECK: call <8 x float> @llvm.ceil.v8f32
+
+define void @ceilf-test(float* nocapture readonly %a, float* nocapture %b, i32 %n) {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %1 = tail call float @llvm.ceil.f32(float %0)
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %1, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare float @llvm.ceil.f32(float)
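
Note (illustrative only, not part of the patch): the sinf-test IR above corresponds to a scalar source loop of roughly the following C++ shape; the function and variable names here are hypothetical. With the SVML vector library and a vectorization factor of 8, the scalar sinf call is widened to a call to __svml_sinf8, and with this change the interleave count stays at 1, so the vector loop body contains exactly one such call instead of several interleaved copies competing for registers.

    #include <math.h>

    // Hypothetical source loop matching the sinf-test IR above: each
    // iteration calls the scalar libm routine sinf() on one element.
    void sinf_loop(const float *a, float *b, int n) {
      for (int i = 0; i < n; ++i)
        b[i] = sinf(a[i]);
    }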