Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -368,6 +368,14 @@
   /// \brief Return true if the hardware has a fast square-root instruction.
   bool haveFastSqrt(Type *Ty) const;

+  /// \brief Return true if the target's SIMD unit is IEEE 754 compliant.
+  /// This enables induction and SLP vectorization without -ffast-math.
+  bool isSIMDIEEE754() const;
+
+  /// \brief Return true if the target implements floating-point subnormal
+  /// handling, i.e. if subnormal behaviour matters at the IEEE 754 level.
+  bool supportsSubnormal() const;
+
   /// \brief Return the expected cost of supporting the floating point operation
   /// of the specified type.
   int getFPOpCost(Type *Ty) const;
@@ -608,6 +616,8 @@
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
+  virtual bool isSIMDIEEE754() = 0;
+  virtual bool supportsSubnormal() = 0;
   virtual int getFPOpCost(Type *Ty) = 0;
   virtual int getIntImmCost(const APInt &Imm, Type *Ty) = 0;
   virtual int getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
@@ -765,6 +775,10 @@
   }
   bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }

+  bool isSIMDIEEE754() override { return Impl.isSIMDIEEE754(); }
+
+  bool supportsSubnormal() override { return Impl.supportsSubnormal(); }
+
   int getFPOpCost(Type *Ty) override { return Impl.getFPOpCost(Ty); }

   int getIntImmCost(const APInt &Imm, Type *Ty) override {
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -246,6 +246,10 @@
   bool haveFastSqrt(Type *Ty) { return false; }

+  bool isSIMDIEEE754() { return true; }
+
+  bool supportsSubnormal() { return true; }
+
   unsigned getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Basic; }

   unsigned getIntImmCost(const APInt &Imm, Type *Ty) { return TTI::TCC_Basic; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -181,6 +181,14 @@
   return TTIImpl->haveFastSqrt(Ty);
 }

+bool TargetTransformInfo::isSIMDIEEE754() const {
+  return TTIImpl->isSIMDIEEE754();
+}
+
+bool TargetTransformInfo::supportsSubnormal() const {
+  return TTIImpl->supportsSubnormal();
+}
+
 int TargetTransformInfo::getFPOpCost(Type *Ty) const {
   int Cost = TTIImpl->getFPOpCost(Ty);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -54,6 +54,11 @@
   bool enableInterleavedAccessVectorization() { return true; }

+  bool isSIMDIEEE754() { return false; }
+
+  // Darwin does not require subnormal handling, so unsafe FP math is allowed everywhere.
+  bool supportsSubnormal() { return !ST->isTargetDarwin(); }
+
   /// \name Scalar TTI Implementations
   /// @{

Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -920,6 +920,9 @@
   /// Return the loop metadata prefix.
   static StringRef Prefix() { return "llvm.loop."; }

+  /// True if the loop contains strict (non-fast-math) FP operations.
+  bool PotentiallyUnsafe;
+
 public:
   enum ForceKind {
     FK_Undefined = -1, ///< Not selected.
@@ -932,7 +935,7 @@
               HK_WIDTH),
         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
         Force("vectorize.enable", FK_Undefined, HK_FORCE),
-        TheLoop(L) {
+        PotentiallyUnsafe(false), TheLoop(L) {
     // Populate values with existing loop metadata.
     getHintsFromMetadata();

@@ -1030,6 +1033,19 @@
     return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
   }

+  bool isPotentiallyUnsafe() const {
+    // Avoid FP vectorization if the target cannot guarantee correct FP
+    // behaviour, e.g. a SIMD unit that does not handle IEEE 754 operations
+    // properly, or that performs incorrect single-to-double promotions.
+    // In that case, a sequence of vectorized loops, even without reductions,
+    // could produce different end results on the destination vectors.
+    return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
+  }
+
+  void setPotentiallyUnsafe() {
+    PotentiallyUnsafe = true;
+  }
+
 private:
   /// Find hints specified in the loop metadata and update local values.
   void getHintsFromMetadata() {
@@ -1191,7 +1207,7 @@
                             const TargetTransformInfo *TTI,
                             LoopAccessAnalysis *LAA,
                             LoopVectorizationRequirements *R,
-                            const LoopVectorizeHints *H)
+                            LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
         TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
         Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
@@ -1417,7 +1433,7 @@
   LoopVectorizationRequirements *Requirements;

   /// Used to emit an analysis of any legality issues.
-  const LoopVectorizeHints *Hints;
+  LoopVectorizeHints *Hints;

   ValueToValueMap Strides;
   SmallPtrSet<Value *, 8> StrideSet;
@@ -1830,6 +1846,21 @@
     return false;
   }

+  // Check whether the target's SIMD unit is IEEE-754 compliant.
+  if (Hints.isPotentiallyUnsafe() &&
+      TTI->supportsSubnormal() &&
+      !TTI->isSIMDIEEE754()) {
+    DEBUG(dbgs() << "LV: Can't vectorize FP loops when target's SIMD is not "
+                    "IEEE-754 compliant.\n");
+    emitAnalysisDiag(
+        F, L, Hints,
+        VectorizationReport()
+            << "non-IEEE-754 compliant SIMD for this target. Use "
+               "-ffast-math or add #pragma clang loop vectorize(enable).");
+    emitMissedWarning(F, L, Hints);
+    return false;
+  }
+
   // Select the optimal vectorization factor.
   const LoopVectorizationCostModel::VectorizationFactor VF =
       CM.selectVectorizationFactor(OptForSize);
@@ -4620,12 +4651,20 @@
         }
         if (EnableMemAccessVersioning)
           collectStridedAccess(ST);
-      }
-      if (EnableMemAccessVersioning)
-        if (LoadInst *LI = dyn_cast<LoadInst>(it))
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(it)) {
+        if (EnableMemAccessVersioning)
          collectStridedAccess(LI);
+        // FP ops without unsafe algebra (fast-math flags) must keep strict
+        // IEEE-754 semantics, which non-compliant SIMD units cannot guarantee.
+      } else if (it->getType()->isFloatingPointTy() &&
+                 (it->isBinaryOp() || it->isCast()) &&
+                 !it->hasUnsafeAlgebra()) {
+        DEBUG(dbgs() << "LV: Found FP op without unsafe algebra.\n");
+        Hints->setPotentiallyUnsafe();
+      }
+
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
       if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
Index: test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -0,0 +1,274 @@
+; RUN: opt -O2 -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+; RUN: opt -mtriple armv7-unknown-darwin -O2 -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
+; REQUIRES: asserts
+
+; Test the loop vectorizer's ability to tell when vectorizing with SIMD is
+; safe with respect to the IEEE 754 standard.
+; On Linux, we only want the vectorizer to vectorize FP loops when the
+; -ffast-math flag is set, because NEON is not IEEE 754 compliant.
+; Darwin, on the other hand, does not require subnormal support, so all
+; optimizations are allowed, even without -ffast-math.
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+; Integer loops are always vectorizable.
+; CHECK: Checking a loop in "sumi"
+; CHECK: Found a vectorizable loop (4) in
+define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Floating-point loops need fast-math to be vectorizable.
+; LINUX: Checking a loop in "sumf"
+; LINUX: Found FP op without unsafe algebra.
+; LINUX: Can't vectorize FP loops when target's SIMD is not IEEE-754 compliant.
+; DARWIN: Checking a loop in "sumf"
+; DARWIN: Found a vectorizable loop (4) in
+define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+  store float %mul, float* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Integer loops are always vectorizable.
+; CHECK: Checking a loop in "redi"
+; CHECK: Found a vectorizable loop (4) in
+define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops need fast-math to be vectorizable.
+; LINUX: Checking a loop in "redf"
+; LINUX: Found FP op without unsafe algebra.
+; LINUX: Can't vectorize FP loops when target's SIMD is not IEEE-754 compliant.
+; DARWIN: Checking a loop in "redf"
+; DARWIN: Found a vectorizable loop (4) in
+define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %add = fadd float %Red.06, %mul
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi float [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret float %Red.0.lcssa
+}
+
+; Integer loops are always vectorizable.
+; CHECK: Checking a loop in "sumi_fast"
+; CHECK: Found a vectorizable loop (4) in
+define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Floating-point loops are vectorizable with fast-math.
+; CHECK: Checking a loop in "sumf_fast"
+; CHECK: Found a vectorizable loop (4) in
+define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul fast float %1, %0
+  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+  store float %mul, float* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Integer loops are always vectorizable.
+; CHECK: Checking a loop in "redi_fast"
+; CHECK: Found a vectorizable loop (4) in
+define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops are vectorizable with fast-math.
+; CHECK: Checking a loop in "redf_fast"
+; CHECK: Found a vectorizable loop (4) in
+define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul fast float %1, %0
+  %add = fadd fast float %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi float [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret float %Red.0.lcssa
+}
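For reference, the gating logic introduced above can be summarised outside of LLVM. The following standalone C++ sketch (not part of the patch; `TargetInfo`, `mayVectorizeFPLoop`, and their fields are illustrative names, not LLVM APIs) mirrors how the new `isSIMDIEEE754()` and `supportsSubnormal()` hooks combine with the per-loop "potentially unsafe" flag and the vectorize(enable) pragma:

```cpp
// Standalone illustration of the FP-vectorization gate added in this patch.
// Names and types are hypothetical; only the decision logic mirrors the diff.
#include <iostream>

struct TargetInfo {
  bool SIMDIsIEEE754;     // analogous to TTI::isSIMDIEEE754()
  bool HandlesSubnormals; // analogous to TTI::supportsSubnormal()
};

// A loop is "potentially unsafe" when it contains FP ops without fast-math
// (unsafe algebra) flags, i.e. ops that must keep strict IEEE 754 semantics.
bool mayVectorizeFPLoop(const TargetInfo &TI, bool PotentiallyUnsafe,
                        bool ForcedByPragma) {
  if (!PotentiallyUnsafe || ForcedByPragma)
    return true;              // fast-math everywhere, or user forced it.
  if (!TI.HandlesSubnormals)
    return true;              // e.g. Darwin: subnormal behaviour is ignored.
  return TI.SIMDIsIEEE754;    // strict FP needs an IEEE-compliant SIMD unit.
}

int main() {
  TargetInfo LinuxNEON = {/*SIMDIsIEEE754=*/false, /*HandlesSubnormals=*/true};
  TargetInfo DarwinNEON = {/*SIMDIsIEEE754=*/false, /*HandlesSubnormals=*/false};

  std::cout << mayVectorizeFPLoop(LinuxNEON, true, false) << '\n';  // 0: blocked
  std::cout << mayVectorizeFPLoop(DarwinNEON, true, false) << '\n'; // 1: allowed
  std::cout << mayVectorizeFPLoop(LinuxNEON, false, false) << '\n'; // 1: fast-math
}
```

This matches the test expectations above: on Linux, a strict FP loop is rejected with "Can't vectorize FP loops when target's SIMD is not IEEE-754 compliant", while Darwin and fast-math loops vectorize normally.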