Index: include/llvm/IR/PatternMatch.h
===================================================================
--- include/llvm/IR/PatternMatch.h
+++ include/llvm/IR/PatternMatch.h
@@ -826,6 +826,18 @@
   return CastClass_match<OpTy, Instruction::SIToFP>(Op);
 }
 
+/// \brief Matches FPTrunc; \p Op is the pattern for the cast's operand.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::FPTrunc> m_FPTrunc(const OpTy &Op) {
+  return CastClass_match<OpTy, Instruction::FPTrunc>(Op);
+}
+
+/// \brief Matches FPExt; \p Op is the pattern for the cast's operand.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::FPExt> m_FPExt(const OpTy &Op) {
+  return CastClass_match<OpTy, Instruction::FPExt>(Op);
+}
+
 //===----------------------------------------------------------------------===//
 // Matchers for unary operators
 //
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1631,6 +1631,18 @@
       return SelectInst::Create(Cond, Call0, Call1);
     }
 
+    Value *ExtSrc;
+    if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) &&
+        II->getArgOperand(0)->hasOneUse()) {
+      // fabs (fpext x) -> fpext (fabs x)
+      Value *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::fabs,
+                                           { ExtSrc->getType() });
+      CallInst *NewFabs = Builder->CreateCall(F, ExtSrc);
+      NewFabs->copyFastMathFlags(II);
+      NewFabs->takeName(II);
+      return new FPExtInst(NewFabs, II->getType());
+    }
+
     break;
   }
   case Intrinsic::cos:
Index: lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1392,21 +1392,24 @@
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
   if (II) {
     switch (II->getIntrinsicID()) {
-      default: break;
-      case Intrinsic::fabs: {
-        // (fptrunc (fabs x)) -> (fabs (fptrunc x))
-        Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0),
-                                                   CI.getType());
-        Type *IntrinsicType[] = { CI.getType() };
-        Function *Overload = Intrinsic::getDeclaration(
-            CI.getModule(), II->getIntrinsicID(), IntrinsicType);
-
-        SmallVector<OperandBundleDef, 1> OpBundles;
-        II->getOperandBundlesAsDefs(OpBundles);
-
-        Value *Args[] = { InnerTrunc };
-        return CallInst::Create(Overload, Args, OpBundles, II->getName());
-      }
+    default: break;
+    case Intrinsic::fabs: {
+      // (fptrunc (fabs x)) -> (fabs (fptrunc x))
+      Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0),
+                                                 CI.getType());
+      Type *IntrinsicType[] = { CI.getType() };
+      Function *Overload = Intrinsic::getDeclaration(
+        CI.getModule(), II->getIntrinsicID(), IntrinsicType);
+
+      SmallVector<OperandBundleDef, 1> OpBundles;
+      II->getOperandBundlesAsDefs(OpBundles);
+
+      Value *Args[] = { InnerTrunc };
+      CallInst *NewCI =  CallInst::Create(Overload, Args,
+                                          OpBundles, II->getName());
+      NewCI->copyFastMathFlags(II);
+      return NewCI;
+    }
     }
   }
 
Index: lib/Transforms/Utils/SimplifyLibCalls.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1210,11 +1210,15 @@
 
 Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {
   Function *Callee = CI->getCalledFunction();
-  StringRef Name = Callee->getName();
-  if (Name == "fabs" && hasFloatVersion(Name))
-    return optimizeUnaryDoubleFP(CI, B, false);
+  IRBuilder<>::FastMathFlagGuard Guard(B);
+  B.setFastMathFlags(CI->getFastMathFlags());
 
-  return nullptr;
+  // fabs/fabsf/fabsl -> llvm.fabs.*, keeping the call's fast-math flags
+  Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::fabs,
+                                       CI->getType());
+  Value *NewCall = B.CreateCall(F, { CI->getArgOperand(0) });
+  NewCall->takeName(CI);
+  return NewCall;
 }
 
 Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
@@ -2029,8 +2033,6 @@
       return optimizePow(CI, Builder);
     case Intrinsic::exp2:
       return optimizeExp2(CI, Builder);
-    case Intrinsic::fabs:
-      return optimizeFabs(CI, Builder);
     case Intrinsic::log:
       return optimizeLog(CI, Builder);
     case Intrinsic::sqrt:
Index: test/Transforms/InstCombine/double-float-shrink-2.ll
===================================================================
--- test/Transforms/InstCombine/double-float-shrink-2.ll
+++ test/Transforms/InstCombine/double-float-shrink-2.ll
@@ -10,7 +10,8 @@
 ; DO-SIMPLIFY: call float @roundf(
 ; DO-SIMPLIFY: call float @nearbyintf(
 ; DO-SIMPLIFY: call float @truncf(
-; DO-SIMPLIFY: call float @fabsf(
+; DO-SIMPLIFY: call float @llvm.fabs.f32(
+; DO-SIMPLIFY: call fast float @llvm.fabs.f32(
 
 ; C89-SIMPLIFY: call float @floorf(
 ; C89-SIMPLIFY: call float @ceilf(
@@ -22,7 +23,10 @@
 ; DONT-SIMPLIFY: call double @round(
 ; DONT-SIMPLIFY: call double @nearbyint(
 ; DONT-SIMPLIFY: call double @trunc(
-; DONT-SIMPLIFY: call double @fabs(
+
+; This is replaced with the intrinsic, which does the right thing on
+; all platforms.
+; DONT-SIMPLIFY: call float @llvm.fabs.f32(
 
 declare double @floor(double)
 declare double @ceil(double)
@@ -30,6 +34,7 @@
 declare double @nearbyint(double)
 declare double @trunc(double)
 declare double @fabs(double)
+declare double @llvm.fabs.f64(double)
 
 define float @test_floor(float %C) {
   %D = fpext float %C to double
@@ -78,3 +83,12 @@
   %F = fptrunc double %E to float
   ret float %F
 }
+
+; Make sure fast math flags are preserved
+define float @test_fabs_fast(float %C) {
+  %D = fpext float %C to double
+  ; --> llvm.fabs.f32, keeping the fast flag
+  %E = call fast double @fabs(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
Index: test/Transforms/InstCombine/fabs-libcall.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/fabs-libcall.ll
@@ -0,0 +1,21 @@
+; RUN: opt -S -mtriple=i686-apple-macosx -instcombine %s | FileCheck %s
+
+declare x86_fp80 @fabsl(x86_fp80)
+
+; CHECK-LABEL: @replace_fabs_call_f80(
+; CHECK-NEXT: %fabsl = call x86_fp80 @llvm.fabs.f80(x86_fp80 %x)
+; CHECK-NEXT: ret x86_fp80 %fabsl
+define x86_fp80 @replace_fabs_call_f80(x86_fp80 %x) {
+  %fabsl = tail call x86_fp80 @fabsl(x86_fp80 %x)
+  ret x86_fp80 %fabsl
+
+}
+
+; CHECK-LABEL: @fmf_replace_fabs_call_f80(
+; CHECK-NEXT: %fabsl = call nnan x86_fp80 @llvm.fabs.f80(x86_fp80 %x)
+; CHECK-NEXT: ret x86_fp80 %fabsl
+define x86_fp80 @fmf_replace_fabs_call_f80(x86_fp80 %x) {
+  %fabsl = tail call nnan x86_fp80 @fabsl(x86_fp80 %x)
+  ret x86_fp80 %fabsl
+}
+
Index: test/Transforms/InstCombine/fabs.ll
===================================================================
--- test/Transforms/InstCombine/fabs.ll
+++ test/Transforms/InstCombine/fabs.ll
@@ -1,6 +1,10 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -instcombine -S | FileCheck %s
 
-; Make sure all library calls are eliminated when the input is known positive.
+; Make sure libcalls are replaced with intrinsic calls.
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare fp128 @llvm.fabs.f128(fp128)
 
 declare float @fabsf(float)
 declare double @fabs(double)
@@ -8,46 +12,46 @@
 declare float @llvm.fma.f32(float, float, float)
 declare float @llvm.fmuladd.f32(float, float, float)
 
-define float @square_fabs_call_f32(float %x) {
-  %mul = fmul float %x, %x
-  %fabsf = tail call float @fabsf(float %mul)
+define float @replace_fabs_call_f32(float %x) {
+  %fabsf = tail call float @fabsf(float %x)
   ret float %fabsf
 
-; CHECK-LABEL: square_fabs_call_f32(
-; CHECK-NEXT: %mul = fmul float %x, %x
-; CHECK-NEXT: %fabsf = tail call float @fabsf(float %mul)
+; CHECK-LABEL: @replace_fabs_call_f32(
+; CHECK-NEXT: %fabsf = call float @llvm.fabs.f32(float %x)
 ; CHECK-NEXT: ret float %fabsf
 }
 
-define double @square_fabs_call_f64(double %x) {
-  %mul = fmul double %x, %x
-  %fabs = tail call double @fabs(double %mul)
+define double @replace_fabs_call_f64(double %x) {
+  %fabs = tail call double @fabs(double %x)
   ret double %fabs
 
-; CHECK-LABEL: square_fabs_call_f64(
-; CHECK-NEXT: %mul = fmul double %x, %x
-; CHECK-NEXT: %fabs = tail call double @fabs(double %mul)
+; CHECK-LABEL: @replace_fabs_call_f64(
+; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %x)
 ; CHECK-NEXT: ret double %fabs
 }
 
-define fp128 @square_fabs_call_f128(fp128 %x) {
-  %mul = fmul fp128 %x, %x
-  %fabsl = tail call fp128 @fabsl(fp128 %mul)
+define fp128 @replace_fabs_call_f128(fp128 %x) {
+  %fabsl = tail call fp128 @fabsl(fp128 %x)
   ret fp128 %fabsl
 
-; CHECK-LABEL: square_fabs_call_f128(
-; CHECK-NEXT: %mul = fmul fp128 %x, %x
-; CHECK-NEXT: %fabsl = tail call fp128 @fabsl(fp128 %mul)
+; CHECK-LABEL: @replace_fabs_call_f128(
+; CHECK-NEXT: %fabsl = call fp128 @llvm.fabs.f128(fp128 %x)
 ; CHECK-NEXT: ret fp128 %fabsl
 }
 
+; Make sure fast math flags are preserved when replacing the libcall.
+define float @fmf_replace_fabs_call_f32(float %x) {
+  %fabsf = tail call nnan float @fabsf(float %x)
+  ret float %fabsf
+
+; CHECK-LABEL: @fmf_replace_fabs_call_f32(
+; CHECK-NEXT: %fabsf = call nnan float @llvm.fabs.f32(float %x)
+; CHECK-NEXT: ret float %fabsf
+}
+
 ; Make sure all intrinsic calls are eliminated when the input is known
 ; positive.
 
-declare float @llvm.fabs.f32(float)
-declare double @llvm.fabs.f64(double)
-declare fp128 @llvm.fabs.f128(fp128)
-
 ; The fabs cannot be eliminated because %x may be a NaN
 define float @square_fabs_intrinsic_f32(float %x) {
   %mul = fmul float %x, %x
@@ -102,10 +106,8 @@
   ret float %trunc
 
 ; CHECK-LABEL: square_fabs_shrink_call1(
-; CHECK-NEXT: %ext = fpext float %x to double
-; CHECK-NEXT: %sq = fmul double %ext, %ext
-; CHECK-NEXT: call double @fabs(double %sq)
-; CHECK-NEXT: %trunc = fptrunc double %fabs to float
+; CHECK-NEXT: fmul float %x, %x
+; CHECK-NEXT: %trunc = call float @llvm.fabs.f32(float
 ; CHECK-NEXT: ret float %trunc
 }
 
@@ -118,8 +120,8 @@
 
 ; CHECK-LABEL: square_fabs_shrink_call2(
 ; CHECK-NEXT: %sq = fmul float %x, %x
-; CHECK-NEXT: %fabsf = call float @fabsf(float %sq)
-; CHECK-NEXT: ret float %fabsf
+; CHECK-NEXT: %trunc = call float @llvm.fabs.f32(float %sq)
+; CHECK-NEXT: ret float %trunc
 }
 
 ; CHECK-LABEL: @fabs_select_constant_negative_positive(
@@ -214,3 +216,16 @@
 ; CHECK-NEXT: %fmuladd = call nnan float @llvm.fmuladd.f32(float %x, float %x, float 1.000000e+00)
 ; CHECK-NEXT: ret float %fmuladd
 }
+
+; Don't introduce a second fpext when the original fpext has other uses
+; CHECK-LABEL: @multi_use_fabs_fpext(
+; CHECK: %fpext = fpext float %x to double
+; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %fpext)
+; CHECK-NEXT: store volatile double %fpext, double* undef, align 8
+; CHECK-NEXT: ret double %fabs
+define double @multi_use_fabs_fpext(float %x) {
+  %fpext = fpext float %x to double
+  %fabs = call double @llvm.fabs.f64(double %fpext)
+  store volatile double %fpext, double* undef
+  ret double %fabs
+}
Index: test/Transforms/InstCombine/float-shrink-compare.ll
===================================================================
--- test/Transforms/InstCombine/float-shrink-compare.ll
+++ test/Transforms/InstCombine/float-shrink-compare.ll
@@ -22,8 +22,20 @@
   %5 = zext i1 %4 to i32
   ret i32 %5
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT: %fabsf = call float @fabsf(float %x)
-; CHECK-NEXT: fcmp oeq float %fabsf, %y
+; CHECK-NEXT: [[FABS:%[0-9]+]] = call float @llvm.fabs.f32(float %x)
+; CHECK-NEXT: fcmp oeq float [[FABS]], %y
+}
+
+define i32 @fmf_test2(float %x, float %y) nounwind uwtable {
+  %1 = fpext float %x to double
+  %2 = call nnan double @fabs(double %1) nounwind readnone
+  %3 = fpext float %y to double
+  %4 = fcmp oeq double %2, %3
+  %5 = zext i1 %4 to i32
+  ret i32 %5
+; CHECK-LABEL: @fmf_test2(
+; CHECK-NEXT: [[FABS:%[0-9]+]] = call nnan float @llvm.fabs.f32(float %x)
+; CHECK-NEXT: fcmp oeq float [[FABS]], %y
 }
 
 define i32 @test3(float %x, float %y) nounwind uwtable {
@@ -99,15 +111,15 @@
 }
 
 define i32 @test9(float %x, float %y) nounwind uwtable {
-  %1 = fpext float %y to double
-  %2 = fpext float %x to double
-  %3 = call double @fabs(double %2) nounwind readnone
-  %4 = fcmp oeq double %1, %3
-  %5 = zext i1 %4 to i32
-  ret i32 %5
+  %x.ext = fpext float %x to double
+  %y.ext = fpext float %y to double
+  %fabs = call double @fabs(double %x.ext) nounwind readnone
+  %cmp = fcmp oeq double %y.ext, %fabs
+  %cmp.ext = zext i1 %cmp to i32
+  ret i32 %cmp.ext
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT: %fabsf = call float @fabsf(float %x)
-; CHECK-NEXT: fcmp oeq float %fabsf, %y
+; CHECK-NEXT: %fabs = call float @llvm.fabs.f32(float %x)
+; CHECK-NEXT: fcmp oeq float %fabs, %y
 }
 
 define i32 @test10(float %x, float %y) nounwind uwtable {
Index: test/Transforms/InstCombine/pow-1.ll
===================================================================
--- test/Transforms/InstCombine/pow-1.ll
+++ test/Transforms/InstCombine/pow-1.ll
@@ -72,7 +72,7 @@
 ; CHECK-LABEL: @test_simplify7(
   %retval = call float @powf(float %x, float 0.5)
 ; CHECK-NEXT: [[SQRTF:%[a-z0-9]+]] = call float @sqrtf(float %x) [[NUW_RO:#[0-9]+]]
-; CHECK-NEXT: [[FABSF:%[a-z0-9]+]] = call float @fabsf(float [[SQRTF]]) [[NUW_RO]]
+; CHECK-NEXT: [[FABSF:%[a-z0-9]+]] = call float @llvm.fabs.f32(float [[SQRTF]])
 ; CHECK-NEXT: [[FCMP:%[a-z0-9]+]] = fcmp oeq float %x, 0xFFF0000000000000
 ; CHECK-NEXT: [[SELECT:%[a-z0-9]+]] = select i1 [[FCMP]], float 0x7FF0000000000000, float [[FABSF]]
   ret float %retval
@@ -83,7 +83,7 @@
 ; CHECK-LABEL: @test_simplify8(
   %retval = call double @pow(double %x, double 0.5)
 ; CHECK-NEXT: [[SQRT:%[a-z0-9]+]] = call double @sqrt(double %x) [[NUW_RO]]
-; CHECK-NEXT: [[FABS:%[a-z0-9]+]] = call double @fabs(double [[SQRT]]) [[NUW_RO]]
+; CHECK-NEXT: [[FABS:%[a-z0-9]+]] = call double @llvm.fabs.f64(double [[SQRT]])
 ; CHECK-NEXT: [[FCMP:%[a-z0-9]+]] = fcmp oeq double %x, 0xFFF0000000000000
 ; CHECK-NEXT: [[SELECT:%[a-z0-9]+]] = select i1 [[FCMP]], double 0x7FF0000000000000, double [[FABS]]
   ret double %retval
@@ -163,7 +163,7 @@
 ; CHECK-LABEL: @test_simplify17(
   %retval = call double @llvm.pow.f64(double %x, double 0.5)
 ; CHECK-NEXT: [[SQRT:%[a-z0-9]+]] = call double @sqrt(double %x)
-; CHECK-NEXT: [[FABS:%[a-z0-9]+]] = call double @fabs(double [[SQRT]])
+; CHECK-NEXT: [[FABS:%[a-z0-9]+]] = call double @llvm.fabs.f64(double [[SQRT]])
 ; CHECK-NEXT: [[FCMP:%[a-z0-9]+]] = fcmp oeq double %x, 0xFFF0000000000000
 ; CHECK-NEXT: [[SELECT:%[a-z0-9]+]] = select i1 [[FCMP]], double 0x7FF0000000000000, double [[FABS]]
   ret double %retval
Index: test/Transforms/InstCombine/win-math.ll
===================================================================
--- test/Transforms/InstCombine/win-math.ll
+++ test/Transforms/InstCombine/win-math.ll
@@ -284,11 +284,11 @@
 ; WIN64: float @powf
 ; MINGW32-LABEL: @float_powsqrt(
 ; MINGW32: float @sqrtf
-; MINGW32: float @fabsf
+; MINGW32: float @llvm.fabs.f32
 ; MINGW32-NOT: float @powf
 ; MINGW64-LABEL: @float_powsqrt(
 ; MINGW64: float @sqrtf
-; MINGW64: float @fabsf
+; MINGW64: float @llvm.fabs.f32(
 ; MINGW64-NOT: float @powf
     %1 = call float @powf(float %x, float 0.5)
     ret float %1
Index: test/Transforms/InstCombine/zero-point-zero-add.ll
===================================================================
--- test/Transforms/InstCombine/zero-point-zero-add.ll
+++ test/Transforms/InstCombine/zero-point-zero-add.ll
@@ -15,7 +15,7 @@
 
 define double @test1(double %X) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[Y:%.*]] = call double @fabs(double %X)
+; CHECK-NEXT:    [[Y:%.*]] = call double @llvm.fabs.f64(double %X)
 ; CHECK-NEXT:    ret double [[Y]]
 ;
   %Y = call double @fabs(double %X)