Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -714,10 +714,30 @@ !isa<DbgInfoIntrinsic>(CI) && !(CI->getCalledFunction() && TLI && TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - ORE->emit(createMissedAnalysis("CantVectorizeCall", CI) - << "call instruction cannot be vectorized"); + // If the call is a recognized math library call, it is likely that + // we can vectorize it given loosened floating-point constraints. + LibFunc Func; + bool IsMathLibCall = + TLI && CI->getCalledFunction() && + CI->getType()->isFloatingPointTy() && + TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) && + TLI->hasOptimizedCodeGen(Func); + + if (IsMathLibCall) { + // TODO: Ideally, we should not use clang-specific language here, + // but it's hard to provide meaningful yet generic advice. + // Also, should this be guarded by allowExtraAnalysis() and/or be part + // of the returned info from isFunctionVectorizable()? + ORE->emit(createMissedAnalysis("CantVectorizeLibcall", CI) + << "library call cannot be vectorized. 
" + "Try compiling with -fno-math-errno, -ffast-math, " + "or similar flags"); + } else { + ORE->emit(createMissedAnalysis("CantVectorizeCall", CI) + << "call instruction cannot be vectorized"); + } LLVM_DEBUG( - dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); + dbgs() << "LV: Found a non-intrinsic callsite.\n"); return false; } Index: llvm/trunk/test/Transforms/LoopVectorize/libcall-remark.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/libcall-remark.ll +++ llvm/trunk/test/Transforms/LoopVectorize/libcall-remark.ll @@ -0,0 +1,52 @@ +; RUN: opt -S -loop-vectorize < %s 2>&1 -pass-remarks-analysis=.* | FileCheck %s + +; Test the optimization remark emitter for recognition +; of a mathlib function vs. an arbitrary function. + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.14.0" +@data = external local_unnamed_addr global [32768 x float], align 16 + +; CHECK: loop not vectorized: library call cannot be vectorized + +define void @libcall_blocks_vectorization() { +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [32768 x float], [32768 x float]* @data, i64 0, i64 %indvars.iv + %t0 = load float, float* %arrayidx, align 4 + %sqrtf = tail call float @sqrtf(float %t0) + store float %sqrtf, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 32768 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK: loop not vectorized: call instruction cannot be vectorized + +define void @arbitrary_call_blocks_vectorization() { +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [32768 x float], [32768 x 
float]* @data, i64 0, i64 %indvars.iv + %t0 = load float, float* %arrayidx, align 4 + %sqrtf = tail call float @arbitrary(float %t0) + store float %sqrtf, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 32768 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +declare float @sqrtf(float) +declare float @arbitrary(float) +