Index: llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -75,9 +75,6 @@
   // rootn
   bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
 
-  // fma/mad
-  bool fold_fma_mad(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
   // -fuse-native for sincos
   bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
 
@@ -649,11 +646,6 @@
     case AMDGPULibFunc::EI_COS:
     case AMDGPULibFunc::EI_SIN:
       return fold_sincos(FPOp, B, FInfo, AA);
-    case AMDGPULibFunc::EI_FMA:
-    case AMDGPULibFunc::EI_MAD:
-    case AMDGPULibFunc::EI_NFMA:
-      // skip vector function
-      return (getVecSize(FInfo) != 1) ? false : fold_fma_mad(CI, B, FInfo);
     default:
       break;
     }
@@ -1087,50 +1079,6 @@
   return false;
 }
 
-bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
-                                  const FuncInfo &FInfo) {
-  Value *opr0 = CI->getArgOperand(0);
-  Value *opr1 = CI->getArgOperand(1);
-  Value *opr2 = CI->getArgOperand(2);
-
-  ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
-  ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
-  if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) {
-    // fma/mad(a, b, c) = c if a=0 || b=0
-    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
-    replaceCall(opr2);
-    return true;
-  }
-  if (CF0 && CF0->isExactlyValue(1.0f)) {
-    // fma/mad(a, b, c) = b+c if a=1
-    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2
-                      << "\n");
-    Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd");
-    replaceCall(nval);
-    return true;
-  }
-  if (CF1 && CF1->isExactlyValue(1.0f)) {
-    // fma/mad(a, b, c) = a+c if b=1
-    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2
-                      << "\n");
-    Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd");
-    replaceCall(nval);
-    return true;
-  }
-  if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) {
-    if (CF->isZero()) {
-      // fma/mad(a, b, c) = a*b if c=0
-      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * "
-                        << *opr1 << "\n");
-      Value *nval = B.CreateFMul(opr0, opr1, "fmamul");
-      replaceCall(nval);
-      return true;
-    }
-  }
-
-  return false;
-}
-
 // Get a scalar native builtin single argument FP function
 FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
                                                  const FuncInfo &FInfo) {
Index: llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -471,7 +471,7 @@
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x
-; GCN: store float %y, ptr addrspace(1) %a
+; GCN: %call = tail call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
 define amdgpu_kernel void @test_fma_0x(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -483,7 +483,7 @@
 declare float @_Z3fmafff(float, float, float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0
-; GCN: store float %y, ptr addrspace(1) %a
+; GCN: %call = tail call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
 define amdgpu_kernel void @test_fma_x0(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -493,7 +493,7 @@
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x
-; GCN: store float %y, ptr addrspace(1) %a
+; GCN: %call = tail call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
 define amdgpu_kernel void @test_mad_0x(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -505,7 +505,7 @@
 declare float @_Z3madfff(float, float, float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0
-; GCN: store float %y, ptr addrspace(1) %a
+; GCN: %call = tail call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
 define amdgpu_kernel void @test_mad_x0(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -515,7 +515,7 @@
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y
-; GCN: %fmaadd = fadd fast float %tmp, %y
+; GCN: %call = tail call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
 define amdgpu_kernel void @test_fma_x1y(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -525,7 +525,7 @@
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy
-; GCN: %fmaadd = fadd fast float %tmp, %y
+; GCN: %call = tail call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
 define amdgpu_kernel void @test_fma_1xy(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -535,7 +535,7 @@
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0
-; GCN: %fmamul = fmul fast float %tmp1, %tmp
+; GCN: %call = tail call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
 define amdgpu_kernel void @test_fma_xy0(ptr addrspace(1) nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1