Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -325,6 +325,10 @@
   /// \brief Return true if the hardware has a fast square-root instruction.
   bool haveFastSqrt(Type *Ty) const;
 
+  /// \brief Return the expected cost of supporting the floating point operation
+  /// of the specified type.
+  unsigned getFPOpCost(Type *Ty) const;
+
   /// \brief Return the expected cost of materializing for the given integer
   /// immediate of the specified type.
   unsigned getIntImmCost(const APInt &Imm, Type *Ty) const;
@@ -516,6 +520,7 @@
   virtual bool shouldBuildLookupTables() = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
+  virtual unsigned getFPOpCost(Type *Ty) = 0;
   virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) = 0;
   virtual unsigned getIntImmCost(unsigned Opc, unsigned Idx, const APInt &Imm,
                                  Type *Ty) = 0;
@@ -631,6 +636,11 @@
     return Impl.getPopcntSupport(IntTyWidthInBit);
   }
   bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
+
+  unsigned getFPOpCost(Type *Ty) override {
+    return Impl.getFPOpCost(Ty);
+  }
+
   unsigned getIntImmCost(const APInt &Imm, Type *Ty) override {
     return Impl.getIntImmCost(Imm, Ty);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -239,6 +239,8 @@
 
   bool haveFastSqrt(Type *Ty) { return false; }
 
+  unsigned getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Basic; }
+
   unsigned getIntImmCost(const APInt &Imm, Type *Ty) { return TTI::TCC_Basic; }
 
   unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -167,6 +167,12 @@
            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
   }
 
+  unsigned getFPOpCost(Type *Ty) {
+    // By default, FP instructions are no more expensive since they are
+    // implemented in HW. Target specific TTI can override this.
+    return TargetTransformInfo::TCC_Basic;
+  }
+
   void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) {
     // This unrolling functionality is target independent, but to provide some
     // motivation for its intended use, for x86:
Index: lib/Analysis/IPA/InlineCost.cpp
===================================================================
--- lib/Analysis/IPA/InlineCost.cpp
+++ lib/Analysis/IPA/InlineCost.cpp
@@ -907,6 +907,25 @@
     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
       ++NumVectorInstructions;
 
+    // If the instruction is floating point, and the target says this operation is
+    // expensive or the function has the "use-soft-float" attribute, this may
+    // eventually become a library call. Treat the cost as such.
+    if (I->getType()->isFloatingPointTy()) {
+      bool hasSoftFloatAttr = false;
+
+      // If the function has the "use-soft-float" attribute, mark it as expensive.
+      if (F.hasFnAttribute("use-soft-float")) {
+        Attribute Attr = F.getFnAttribute("use-soft-float");
+        StringRef Val = Attr.getValueAsString();
+        if (Val == "true")
+          hasSoftFloatAttr = true;
+      }
+
+      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
+          hasSoftFloatAttr)
+        Cost += InlineConstants::CallPenalty;
+    }
+
     // If the instruction simplified to a constant, there is no cost to this
     // instruction. Visit the instructions using our InstVisitor to account for
     // all of the per-instruction logic. The visit tree returns true if we
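Note on the InlineCost.cpp hunk: the added logic amounts to a single predicate per instruction. The following is a condensed sketch for readers of the patch, not code that is part of it; the helper name isExpensiveFPInstruction and the explicit F/TTI parameters are invented here for illustration (the patch itself works on the CallAnalyzer's members).

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Sketch only: an FP-typed instruction is charged InlineConstants::CallPenalty
// when this predicate is true.
static bool isExpensiveFPInstruction(const Instruction *I, const Function &F,
                                     const TargetTransformInfo &TTI) {
  if (!I->getType()->isFloatingPointTy())
    return false;
  // Soft-float functions lower FP arithmetic to library calls.
  if (F.getFnAttribute("use-soft-float").getValueAsString() == "true")
    return true;
  // Otherwise defer to the target's estimate of the FP operation cost.
  return TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive;
}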
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -148,6 +148,10 @@
   return TTIImpl->haveFastSqrt(Ty);
 }
 
+unsigned TargetTransformInfo::getFPOpCost(Type *Ty) const {
+  return TTIImpl->getFPOpCost(Ty);
+}
+
 unsigned TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const {
   return TTIImpl->getIntImmCost(Imm, Ty);
 }
Index: lib/Target/ARM/ARMSubtarget.h
===================================================================
--- lib/Target/ARM/ARMSubtarget.h
+++ lib/Target/ARM/ARMSubtarget.h
@@ -310,7 +310,8 @@
   bool hasCRC() const { return HasCRC; }
   bool hasVirtualization() const { return HasVirtualization; }
   bool useNEONForSinglePrecisionFP() const {
-    return hasNEON() && UseNEONForSinglePrecisionFP; }
+    return hasNEON() && UseNEONForSinglePrecisionFP;
+  }
 
   bool hasDivide() const { return HasHardwareDivide; }
   bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -114,6 +114,8 @@
 
   unsigned getAddressComputationCost(Type *Val, bool IsComplex);
 
+  unsigned getFPOpCost(Type *Ty);
+
   unsigned getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -314,6 +314,25 @@
   return 1;
 }
 
+unsigned ARMTTIImpl::getFPOpCost(Type *Ty) {
+  // Use similar logic that's in ARMISelLowering:
+  // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access
+  // to VFP.
+
+  if (ST->hasVFP2() && !ST->isThumb1Only()) {
+    if (Ty->isFloatTy()) {
+      return TargetTransformInfo::TCC_Basic;
+    }
+
+    if (Ty->isDoubleTy()) {
+      return ST->isFPOnlySP() ? TargetTransformInfo::TCC_Expensive :
+                                TargetTransformInfo::TCC_Basic;
+    }
+  }
+
+  return TargetTransformInfo::TCC_Expensive;
+}
+
 unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                     Type *SubTp) {
   // We only handle costs of reverse and alternate shuffles for now.
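For reference while reading the TCC_* values returned above: getFPOpCost() reuses the generic TTI cost bands declared in include/llvm/Analysis/TargetTransformInfo.h. This enum is existing code, untouched by the patch; it is reproduced here from memory, with approximate comments.

enum TargetCostConstants {
  TCC_Free = 0,      ///< Expected to fold away within lowering.
  TCC_Basic = 1,     ///< The cost of a typical 'add' instruction.
  TCC_Expensive = 4  ///< The cost of a 'div' instruction on x86.
};

With the ARM override above, a soft-float or Thumb1-only subtarget therefore reports every FP type as TCC_Expensive, while a single-precision-only FPU (isFPOnlySP(), e.g. Cortex-M4) reports TCC_Expensive for double but TCC_Basic for float.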
Index: test/Transforms/Inline/inline-fp.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/inline-fp.ll
@@ -0,0 +1,140 @@
+; RUN: opt -S -inline < %s | FileCheck %s
+; Make sure that soft-float implementations are treated as more expensive by
+; the inliner.
+
+define i32 @test_nofp() #0 {
+entry:
+  %responseX = alloca i32, align 4
+  %responseY = alloca i32, align 4
+  %responseZ = alloca i32, align 4
+  %valueX = alloca i8, align 1
+  %valueY = alloca i8, align 1
+  %valueZ = alloca i8, align 1
+
+  call void @getX(i32* %responseX, i8* %valueX)
+  call void @getY(i32* %responseY, i8* %valueY)
+  call void @getZ(i32* %responseZ, i8* %valueZ)
+
+  %0 = load i32* %responseX
+  %1 = load i8* %valueX
+  %call = call float @f_nofp(i32 %0, i8 zeroext %1)
+  %2 = load i32* %responseZ
+  %3 = load i8* %valueZ
+  %call2 = call float @f_nofp(i32 %2, i8 zeroext %3)
+  %call3 = call float @fabsf(float %call)
+  %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
+  br i1 %cmp, label %if.end12, label %if.else
+
+if.else:                                          ; preds = %entry
+  %4 = load i32* %responseY
+  %5 = load i8* %valueY
+  %call1 = call float @f_nofp(i32 %4, i8 zeroext %5)
+  %call4 = call float @fabsf(float %call1)
+  %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
+  br i1 %cmp5, label %if.end12, label %if.else7
+
+if.else7:                                         ; preds = %if.else
+  %call8 = call float @fabsf(float %call2)
+  %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
+  br i1 %cmp9, label %if.then10, label %if.end12
+
+if.then10:                                        ; preds = %if.else7
+  br label %if.end12
+
+if.end12:                                         ; preds = %if.else, %entry, %if.then10, %if.else7
+  %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
+  ret i32 %success.0
+}
+
+define i32 @test_hasfp() #0 {
+entry:
+  %responseX = alloca i32, align 4
+  %responseY = alloca i32, align 4
+  %responseZ = alloca i32, align 4
+  %valueX = alloca i8, align 1
+  %valueY = alloca i8, align 1
+  %valueZ = alloca i8, align 1
+
+  call void @getX(i32* %responseX, i8* %valueX)
+  call void @getY(i32* %responseY, i8* %valueY)
+  call void @getZ(i32* %responseZ, i8* %valueZ)
+
+  %0 = load i32* %responseX
+  %1 = load i8* %valueX
+  %call = call float @f_hasfp(i32 %0, i8 zeroext %1)
+  %2 = load i32* %responseZ
+  %3 = load i8* %valueZ
+  %call2 = call float @f_hasfp(i32 %2, i8 zeroext %3)
+  %call3 = call float @fabsf(float %call)
+  %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
+  br i1 %cmp, label %if.end12, label %if.else
+
+if.else:                                          ; preds = %entry
+  %4 = load i32* %responseY
+  %5 = load i8* %valueY
+  %call1 = call float @f_hasfp(i32 %4, i8 zeroext %5)
+  %call4 = call float @fabsf(float %call1)
+  %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
+  br i1 %cmp5, label %if.end12, label %if.else7
+
+if.else7:                                         ; preds = %if.else
+  %call8 = call float @fabsf(float %call2)
+  %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
+  br i1 %cmp9, label %if.then10, label %if.end12
+
+if.then10:                                        ; preds = %if.else7
+  br label %if.end12
+
+if.end12:                                         ; preds = %if.else, %entry, %if.then10, %if.else7
+  %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
+  ret i32 %success.0
+}
+
+declare void @getX(i32*, i8*) #0
+
+declare void @getY(i32*, i8*) #0
+
+declare void @getZ(i32*, i8*) #0
+
+; f_nofp() is marked with the "use-soft-float" attribute, so it should never get inlined.
+; f_hasfp() should get inlined.
+; CHECK: @test_nofp
+; CHECK: call float @f_nofp
+; CHECK: ret i32
+
+; CHECK: @test_hasfp
+; CHECK-NOT: call float @f_hasfp
+; CHECK: ret i32
+
+define internal float @f_hasfp(i32 %response, i8 zeroext %value1) #0 {
+entry:
+  %conv = zext i8 %value1 to i32
+  %sub = add nsw i32 %conv, -1
+  %conv1 = sitofp i32 %sub to float
+  %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+  %mul = fmul float %0, 2.620000e+03
+  %conv2 = sitofp i32 %response to float
+  %sub3 = fsub float %conv2, %mul
+  %div = fdiv float %sub3, %mul
+  ret float %div
+}
+
+define internal float @f_nofp(i32 %response, i8 zeroext %value1) #1 {
+entry:
+  %conv = zext i8 %value1 to i32
+  %sub = add nsw i32 %conv, -1
+  %conv1 = sitofp i32 %sub to float
+  %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+  %mul = fmul float %0, 2.620000e+03
+  %conv2 = sitofp i32 %response to float
+  %sub3 = fsub float %conv2, %mul
+  %div = fdiv float %sub3, %mul
+  ret float %div
+}
+
+declare float @fabsf(float) optsize minsize
+
+declare float @llvm.pow.f32(float, float) optsize minsize
+
+attributes #0 = { minsize optsize }
+attributes #1 = { minsize optsize "use-soft-float"="true" }