Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -368,6 +368,14 @@
   /// \brief Return true if the hardware has a fast square-root instruction.
   bool haveFastSqrt(Type *Ty) const;

+  /// \brief Return true if the target's SIMD unit is IEEE 754 compliant.
+  /// This enables induction and SLP vectorization without -ffast-math.
+  bool isSIMDIEEE754() const;
+
+  /// \brief Return true if the target implements floating-point subnormal
+  /// handling, i.e. if subnormal behaviour matters at the IEEE 754 level.
+  bool supportsSubnormal() const;
+
   /// \brief Return the expected cost of supporting the floating point operation
   /// of the specified type.
   int getFPOpCost(Type *Ty) const;
@@ -608,6 +616,8 @@
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
+  virtual bool isSIMDIEEE754() = 0;
+  virtual bool supportsSubnormal() = 0;
   virtual int getFPOpCost(Type *Ty) = 0;
   virtual int getIntImmCost(const APInt &Imm, Type *Ty) = 0;
   virtual int getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
@@ -765,6 +775,10 @@
   }
   bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }

+  bool isSIMDIEEE754() override { return Impl.isSIMDIEEE754(); }
+
+  bool supportsSubnormal() override { return Impl.supportsSubnormal(); }
+
   int getFPOpCost(Type *Ty) override { return Impl.getFPOpCost(Ty); }

   int getIntImmCost(const APInt &Imm, Type *Ty) override {
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -246,6 +246,10 @@
   bool haveFastSqrt(Type *Ty) { return false; }

+  bool isSIMDIEEE754() { return true; }
+
+  bool supportsSubnormal() { return true; }
+
   unsigned getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Basic; }

   unsigned getIntImmCost(const APInt &Imm, Type *Ty) { return TTI::TCC_Basic; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -181,6 +181,14 @@
   return TTIImpl->haveFastSqrt(Ty);
 }

+bool TargetTransformInfo::isSIMDIEEE754() const {
+  return TTIImpl->isSIMDIEEE754();
+}
+
+bool TargetTransformInfo::supportsSubnormal() const {
+  return TTIImpl->supportsSubnormal();
+}
+
 int TargetTransformInfo::getFPOpCost(Type *Ty) const {
   int Cost = TTIImpl->getFPOpCost(Ty);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -54,6 +54,11 @@
   bool enableInterleavedAccessVectorization() { return true; }

+  bool isSIMDIEEE754() { return false; }
+
+  // Darwin does not require subnormal handling, so unsafe FP math is allowed everywhere.
+  bool supportsSubnormal() { return !ST->isTargetDarwin(); }
+
   /// \name Scalar TTI Implementations
   /// @{

Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -920,6 +920,9 @@
   /// Return the loop metadata prefix.
   static StringRef Prefix() { return "llvm.loop."; }

+  /// True if the loop contains strict (non-fast-math) FP operations.
+  bool PotentiallyUnsafe;
+
 public:
   enum ForceKind {
     FK_Undefined = -1, ///< Not selected.
@@ -932,7 +935,7 @@
               HK_WIDTH),
         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
         Force("vectorize.enable", FK_Undefined, HK_FORCE),
-        TheLoop(L) {
+        PotentiallyUnsafe(false), TheLoop(L) {
     // Populate values with existing loop metadata.
     getHintsFromMetadata();

@@ -1030,6 +1033,19 @@
     return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
   }

+  bool isPotentiallyUnsafe() const {
+    // Avoid FP vectorization if the target cannot guarantee correct FP
+    // behaviour, e.g. a SIMD unit that does not handle IEEE 754 operations
+    // properly, or that performs incorrect single-to-double promotions.
+    // In that case, a sequence of vectorized loops, even without reductions,
+    // could produce different end results on the destination vectors.
+    return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
+  }
+
+  void setPotentiallyUnsafe() {
+    PotentiallyUnsafe = true;
+  }
+
 private:
   /// Find hints specified in the loop metadata and update local values.
   void getHintsFromMetadata() {
@@ -1191,7 +1207,7 @@
                             const TargetTransformInfo *TTI,
                             LoopAccessAnalysis *LAA,
                             LoopVectorizationRequirements *R,
-                            const LoopVectorizeHints *H)
+                            LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
         TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
         Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
@@ -1417,7 +1433,7 @@
   LoopVectorizationRequirements *Requirements;

   /// Used to emit an analysis of any legality issues.
-  const LoopVectorizeHints *Hints;
+  LoopVectorizeHints *Hints;

   ValueToValueMap Strides;
   SmallPtrSet<Value *, 8> StrideSet;
@@ -1830,6 +1846,21 @@
     return false;
   }

+  // Check whether the target's SIMD unit is IEEE-754 compliant.
+  if (Hints.isPotentiallyUnsafe() &&
+      TTI->supportsSubnormal() &&
+      !TTI->isSIMDIEEE754()) {
+    DEBUG(dbgs() << "LV: Can't vectorize FP loops when target's SIMD is not "
+                    "IEEE-754 compliant.\n");
+    emitAnalysisDiag(
+        F, L, Hints,
+        VectorizationReport()
+            << "non-IEEE-754 compliant SIMD for this target. Use "
+               "-ffast-math or add #pragma clang loop vectorize(enable).");
+    emitMissedWarning(F, L, Hints);
+    return false;
+  }
+
   // Select the optimal vectorization factor.
   const LoopVectorizationCostModel::VectorizationFactor VF =
       CM.selectVectorizationFactor(OptForSize);
@@ -4620,12 +4651,20 @@
         }
         if (EnableMemAccessVersioning)
           collectStridedAccess(ST);
-      }
-      if (EnableMemAccessVersioning)
-        if (LoadInst *LI = dyn_cast<LoadInst>(it))
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(it)) {
+        if (EnableMemAccessVersioning)
          collectStridedAccess(LI);
+        // FP ops without unsafe algebra (fast-math flags) must keep strict
+        // IEEE-754 semantics, which non-compliant SIMD units cannot guarantee.
+      } else if (it->getType()->isFloatingPointTy() &&
+                 (it->isBinaryOp() || it->isCast()) &&
+                 !it->hasUnsafeAlgebra()) {
+        DEBUG(dbgs() << "LV: Found FP op without unsafe algebra.\n");
+        Hints->setPotentiallyUnsafe();
+      }
+
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
       if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
Index: test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -0,0 +1,274 @@
+; RUN: opt -O2 -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+; RUN: opt -mtriple armv7-unknown-darwin -O2 -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
+; REQUIRES: asserts
+
+; Test the loop vectorizer's ability to tell when vectorizing with SIMD is
+; safe with respect to the IEEE 754 standard.
+; On Linux, we only want the vectorizer to vectorize FP loops when the
+; -ffast-math flag is set, because NEON is not IEEE 754 compliant.
+; Darwin, on the other hand, does not require subnormal support, so all
+; optimizations are allowed, even without -ffast-math.
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+; Integer loops are always vectorizable.
+; CHECK: Checking a loop in "sumi"
+; CHECK: Found a vectorizable loop (4) in
+define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Floating-point loops need fast-math to be vectorizable.
+; LINUX: Checking a loop in "sumf"
+; LINUX: Found FP op without unsafe algebra.
+; LINUX: Can't vectorize FP loops when target's SIMD is not IEEE-754 compliant.
+; DARWIN: Checking a loop in "sumf"
+; DARWIN: Found a vectorizable loop (4) in
+define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+  store float %mul, float* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Integer loops are always vectorizable.
+; CHECK: Checking a loop in "redi"
+; CHECK: Found a vectorizable loop (4) in
+define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops need fast-math to be vectorizable.
+; LINUX: Checking a loop in "redf"
+; LINUX: Found FP op without unsafe algebra.
+; LINUX: Can't vectorize FP loops when target's SIMD is not IEEE-754 compliant.
+; DARWIN: Checking a loop in "redf"
+; DARWIN: Found a vectorizable loop (4) in
+define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %add = fadd float %Red.06, %mul
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi float [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret float %Red.0.lcssa
+}
+
+; Integer loops are always vectorizable.
+; CHECK: Checking a loop in "sumi_fast"
+; CHECK: Found a vectorizable loop (4) in
+define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Floating-point loops are vectorizable with fast-math.
+; CHECK: Checking a loop in "sumf_fast"
+; CHECK: Found a vectorizable loop (4) in
+define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul fast float %1, %0
+  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+  store float %mul, float* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Integer loops are always vectorizable.
+; CHECK: Checking a loop in "redi_fast"
+; CHECK: Found a vectorizable loop (4) in
+define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops are vectorizable with fast-math.
+; CHECK: Checking a loop in "redf_fast"
+; CHECK: Found a vectorizable loop (4) in
+define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul fast float %1, %0
+  %add = fadd fast float %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi float [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret float %Red.0.lcssa
+}
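For reference, the gating logic introduced above can be summarised outside of LLVM. The following standalone C++ sketch (not part of the patch; `TargetInfo`, `mayVectorizeFPLoop`, and their fields are illustrative names, not LLVM APIs) mirrors how the new `isSIMDIEEE754()` and `supportsSubnormal()` hooks combine with the per-loop "potentially unsafe" flag and the vectorize(enable) pragma:

```cpp
// Standalone illustration of the FP-vectorization gate added in this patch.
// Names and types are hypothetical; only the decision logic mirrors the diff.
#include <iostream>

struct TargetInfo {
  bool SIMDIsIEEE754;     // analogous to TTI::isSIMDIEEE754()
  bool HandlesSubnormals; // analogous to TTI::supportsSubnormal()
};

// A loop is "potentially unsafe" when it contains FP ops without fast-math
// (unsafe algebra) flags, i.e. ops that must keep strict IEEE 754 semantics.
bool mayVectorizeFPLoop(const TargetInfo &TI, bool PotentiallyUnsafe,
                        bool ForcedByPragma) {
  if (!PotentiallyUnsafe || ForcedByPragma)
    return true;              // fast-math everywhere, or user forced it.
  if (!TI.HandlesSubnormals)
    return true;              // e.g. Darwin: subnormal behaviour is ignored.
  return TI.SIMDIsIEEE754;    // strict FP needs an IEEE-compliant SIMD unit.
}

int main() {
  TargetInfo LinuxNEON = {/*SIMDIsIEEE754=*/false, /*HandlesSubnormals=*/true};
  TargetInfo DarwinNEON = {/*SIMDIsIEEE754=*/false, /*HandlesSubnormals=*/false};

  std::cout << mayVectorizeFPLoop(LinuxNEON, true, false) << '\n';  // 0: blocked
  std::cout << mayVectorizeFPLoop(DarwinNEON, true, false) << '\n'; // 1: allowed
  std::cout << mayVectorizeFPLoop(LinuxNEON, false, false) << '\n'; // 1: fast-math
}
```

This matches the test expectations above: on Linux, a strict FP loop is rejected with "Can't vectorize FP loops when target's SIMD is not IEEE-754 compliant", while Darwin and fast-math loops vectorize normally.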