Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -85,7 +85,8 @@
   SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
-  unsigned getFusedOpcode(const SelectionDAG &DAG, EVT VT) const;
+  unsigned getFusedOpcode(const SelectionDAG &DAG,
+                          const SDNode *N0, const SDNode *N1) const;
   SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3906,7 +3906,11 @@
   return SDValue();
 }
 
-unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, EVT VT) const {
+unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
+                                          const SDNode *N0,
+                                          const SDNode *N1) const {
+  EVT VT = N0->getValueType(0);
+
   // Only do this if we are not trying to support denormals. v_mad_f32 does not
   // support denormals ever.
   if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
@@ -3914,7 +3918,10 @@
     return ISD::FMAD;
 
   const TargetOptions &Options = DAG.getTarget().Options;
-  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
+  if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
+       Options.UnsafeFPMath ||
+       (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() &&
+        cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) &&
       isFMAFasterThanFMulAndFAdd(VT)) {
     return ISD::FMA;
   }
@@ -3942,7 +3949,7 @@
   if (LHS.getOpcode() == ISD::FADD) {
     SDValue A = LHS.getOperand(0);
     if (A == LHS.getOperand(1)) {
-      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
       if (FusedOp != 0) {
         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
         return DAG.getNode(FusedOp, SL, VT, Two, A, RHS);
@@ -3954,7 +3961,7 @@
   if (RHS.getOpcode() == ISD::FADD) {
     SDValue A = RHS.getOperand(0);
     if (A == RHS.getOperand(1)) {
-      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
       if (FusedOp != 0) {
         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
         return DAG.getNode(FusedOp, SL, VT, Two, A, LHS);
@@ -3986,7 +3993,7 @@
     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
     SDValue A = LHS.getOperand(0);
     if (A == LHS.getOperand(1)) {
-      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0){
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
@@ -4001,7 +4008,7 @@
 
     SDValue A = RHS.getOperand(0);
     if (A == RHS.getOperand(1)) {
-      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0){
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, NegTwo, A, LHS);
Index: test/CodeGen/AMDGPU/fmuladd.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -112,6 +112,69 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast_add0:
+; GCN-STRICT: v_add_f64
+; GCN-STRICT: v_add_f64
+
+; GCN-CONTRACT: v_fma_f64
+define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
+                                      double addrspace(1)* %in1,
+                                      double addrspace(1)* %in2) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %r0 = load volatile double, double addrspace(1)* %gep.0
+  %r1 = load volatile double, double addrspace(1)* %gep.1
+
+  %add.0 = fadd fast double %r0, %r0
+  %add.1 = fadd double %add.0, %r1
+  store double %add.1, double addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast_add1:
+; GCN-STRICT: v_add_f64
+; GCN-STRICT: v_add_f64
+
+; GCN-CONTRACT: v_fma_f64
+define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
+                                      double addrspace(1)* %in1,
+                                      double addrspace(1)* %in2) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %r0 = load volatile double, double addrspace(1)* %gep.0
+  %r1 = load volatile double, double addrspace(1)* %gep.1
+
+  %add.0 = fadd double %r0, %r0
+  %add.1 = fadd fast double %add.0, %r1
+  store double %add.1, double addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast:
+; GCN: v_fma_f64
+define void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
+                                 double addrspace(1)* %in1,
+                                 double addrspace(1)* %in2) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %r0 = load volatile double, double addrspace(1)* %gep.0
+  %r1 = load volatile double, double addrspace(1)* %gep.1
+
+  %add.0 = fadd fast double %r0, %r0
+  %add.1 = fadd fast double %add.0, %r1
+  store double %add.1, double addrspace(1)* %gep.out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare double @llvm.fmuladd.f64(double, double, double) #1
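
Note on the change above: getFusedOpcode now keys the FMA decision off the unsafe-algebra flags of the two nodes being combined, not only the global TargetOptions, which is what the new fast_add0/fast_add1/fast tests exercise: one flagged fadd is not enough, both nodes must carry the flag. The standalone C++ sketch below models that decision outside of LLVM for illustration only; every type and name in it (FusedOp, NodeFlags, GlobalOptions, SubtargetInfo) is a hypothetical stand-in, not the SelectionDAG API, and the denormal check is simplified to the f32 case.

// Standalone model of the fusion decision made by getFusedOpcode().
// All types and names here are illustrative stand-ins, not LLVM's API.
#include <cstdio>

enum class FusedOp { None, FMAD, FMA };

struct NodeFlags {               // models per-node SDNodeFlags
  bool UnsafeAlgebra = false;
};

struct GlobalOptions {           // models TargetOptions
  bool FPOpFusionFast = false;   // global "allow fp-op fusion" mode
  bool UnsafeFPMath = false;     // global unsafe-fp-math mode
};

struct SubtargetInfo {
  bool HasF32Denormals = true;   // v_mad_f32 never supports denormals
  bool FMAIsFast = true;         // models isFMAFasterThanFMulAndFAdd()
};

FusedOp getFusedOpcode(const GlobalOptions &Opts, const SubtargetInfo &ST,
                       const NodeFlags &N0, const NodeFlags &N1,
                       bool IsF32) {
  // Without denormal support, the MAD form is always usable.
  if (IsF32 && !ST.HasF32Denormals)
    return FusedOp::FMAD;

  // New in this patch: per-instruction unsafe-algebra flags on *both*
  // nodes are enough, even without the global unsafe options.
  bool FusionAllowed = Opts.FPOpFusionFast || Opts.UnsafeFPMath ||
                       (N0.UnsafeAlgebra && N1.UnsafeAlgebra);

  if (FusionAllowed && ST.FMAIsFast)
    return FusedOp::FMA;

  return FusedOp::None;
}

int main() {
  GlobalOptions Opts;            // strict global FP environment
  SubtargetInfo ST;              // f64-like: denormals supported
  NodeFlags Fast;
  Fast.UnsafeAlgebra = true;
  NodeFlags Strict;              // UnsafeAlgebra stays false

  // Both fadds carry 'fast': fusion happens (fadd_a_a_b_f64_fast test).
  std::printf("both fast -> FMA? %d\n",
              getFusedOpcode(Opts, ST, Fast, Fast, false) == FusedOp::FMA);
  // Only one fadd is 'fast': no fusion (fast_add0 / fast_add1 tests).
  std::printf("one fast  -> FMA? %d\n",
              getFusedOpcode(Opts, ST, Fast, Strict, false) == FusedOp::FMA);
  return 0;
}

The sketch prints 1 for the both-fast case and 0 for the mixed case, mirroring the GCN-STRICT checks above, which still expect two v_add_f64 instructions when only one of the adds is flagged.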