Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -611,7 +611,7 @@ /// platform, scalar floating-point math does. /// This applies to floating-point math operations and calls, not memory /// operations, shuffles, or casts. - bool isFPVectorizationPotentiallyUnsafe() const; + bool isFPVectorizationPotentiallyUnsafe(bool IsFTZEnabled) const; /// Determine if the target supports unaligned memory accesses. bool allowsMisalignedMemoryAccesses(LLVMContext &Context, @@ -1106,7 +1106,7 @@ bool IsZeroCmp) const = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool enableMaskedInterleavedAccessVectorization() = 0; - virtual bool isFPVectorizationPotentiallyUnsafe() = 0; + virtual bool isFPVectorizationPotentiallyUnsafe(bool IsFTZEnabled) = 0; virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, @@ -1392,8 +1392,8 @@ bool enableMaskedInterleavedAccessVectorization() override { return Impl.enableMaskedInterleavedAccessVectorization(); } - bool isFPVectorizationPotentiallyUnsafe() override { - return Impl.isFPVectorizationPotentiallyUnsafe(); + bool isFPVectorizationPotentiallyUnsafe(bool IsFTZEnabled) override { + return Impl.isFPVectorizationPotentiallyUnsafe(IsFTZEnabled); } bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -317,7 +317,7 @@ bool enableMaskedInterleavedAccessVectorization() { return false; } - bool isFPVectorizationPotentiallyUnsafe() { return false; } + bool isFPVectorizationPotentiallyUnsafe(bool IsFTZEnabled) { return false; } bool 
allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -278,8 +278,9 @@ return TTIImpl->enableMaskedInterleavedAccessVectorization(); } -bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const { - return TTIImpl->isFPVectorizationPotentiallyUnsafe(); +bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe( + bool IsFTZEnabled) const { + return TTIImpl->isFPVectorizationPotentiallyUnsafe(IsFTZEnabled); } bool TargetTransformInfo::allowsMisalignedMemoryAccesses(LLVMContext &Context, Index: lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.h +++ lib/Target/ARM/ARMTargetTransformInfo.h @@ -102,8 +102,8 @@ /// Floating-point computation using ARMv8 AArch32 Advanced /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD /// is IEEE-754 compliant, but it's not covered in this target. - bool isFPVectorizationPotentiallyUnsafe() { - return !ST->isTargetDarwin(); + bool isFPVectorizationPotentiallyUnsafe(bool IsFTZEnabled) { + return !(IsFTZEnabled || ST->isTargetDarwin()); } /// \name Scalar TTI Implementations Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7284,12 +7284,14 @@ return false; } + bool IsFTZEnabled = F->hasFnAttribute("ftz"); + // Check if the target supports potentially unsafe FP vectorization. // FIXME: Add a check for the type of safety issue (denormal, signaling) // for the target we're vectorizing for, to make sure none of the // additional fp-math flags can help. 
if (Hints.isPotentiallyUnsafe() && - TTI->isFPVectorizationPotentiallyUnsafe()) { + TTI->isFPVectorizationPotentiallyUnsafe(IsFTZEnabled)) { LLVM_DEBUG( dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); ORE->emit( Index: test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll =================================================================== --- test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll +++ test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll @@ -5,8 +5,8 @@ ; Testing the ability of the loop vectorizer to tell when SIMD is safe or not ; regarding IEEE 754 standard. -; On Linux, we only want the vectorizer to work when -ffast-math flag is set, -; because NEON is not IEEE compliant. +; On Linux, we only want the vectorizer to work when -ffast-math flag is set or +; when the function has an FTZ attribute, because NEON is not IEEE compliant. ; Darwin, on the other hand, doesn't support subnormals, and all optimizations ; are allowed, even without -ffast-math. 
@@ -326,5 +326,40 @@ declare float @fabsf(float) +; Floating-point loops with the "ftz" attribute are vectorizable without fast-math +; LINUX: Checking a loop in "sumf_with_ftz" +; LINUX-NOT: Potentially unsafe FP op prevents vectorization +; DARWIN: Checking a loop in "sumf_with_ftz" +; DARWIN-NOT: Potentially unsafe FP op prevents vectorization +define void @sumf_with_ftz(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) #3 { +entry: + %cmp5 = icmp eq i32 %N, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06 + %0 = load float, float* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06 + %1 = load float, float* %arrayidx1, align 4 + %mul = fmul float %0, %1 + %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06 + store float %mul, float* %arrayidx2, align 4 + %inc = add nuw nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" 
"target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #3 = { "ftz" } +