Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9678,29 +9678,67 @@
                          &DAG.getTarget().Options))
     return GetNegatedExpression(N0, DAG, LegalOperations);
 
-  // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
-  // constant pool values.
-  if (!TLI.isFNegFree(VT) &&
-      N0.getOpcode() == ISD::BITCAST &&
-      N0.getNode()->hasOneUse()) {
-    SDValue Int = N0.getOperand(0);
-    EVT IntVT = Int.getValueType();
-    if (IntVT.isInteger() && !IntVT.isVector()) {
-      APInt SignMask;
-      if (N0.getValueType().isVector()) {
-        // For a vector, get a mask such as 0x80... per scalar element
-        // and splat it.
-        SignMask = APInt::getSignBit(N0.getScalarValueSizeInBits());
-        SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
-      } else {
-        // For a scalar, just generate 0x80...
-        SignMask = APInt::getSignBit(IntVT.getSizeInBits());
+  if (TLI.isFNegFree(VT)) {
+    SDLoc SL(N);
+    unsigned Opc = N0.getOpcode();
+
+    // If the input has multiple uses and we can either fold the negate down,
+    // or the other uses cannot, give up. This both prevents unprofitable
+    // transformations and infinite loops: we won't repeatedly try to
+    // fold around a negate that has no 'good' form.
+    // TODO: Check if users are foldable.
+    if ((Opc == ISD::FADD || Opc == ISD::FMUL || Opc == ISD::FMA) &&
+        !N0.hasOneUse())
+      return SDValue();
+
+    switch (Opc) {
+    case ISD::FADD: {
+      // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
+      SDValue LHS = N0.getOperand(0);
+      SDValue RHS = N0.getOperand(1);
+
+      if (LHS.getOpcode() != ISD::FNEG)
+        LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
+      else
+        LHS = LHS.getOperand(0);
+
+      if (RHS.getOpcode() != ISD::FNEG)
+        RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+      else
+        RHS = RHS.getOperand(0);
+
+      SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS);
+      if (!N0.hasOneUse())
+        DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+      return Res;
+    }
+    default:
+      break;
+    }
+  } else {
+    // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
+    // constant pool values.
+    if (N0.getOpcode() == ISD::BITCAST &&
+        N0.getNode()->hasOneUse()) {
+      SDValue Int = N0.getOperand(0);
+      EVT IntVT = Int.getValueType();
+      if (IntVT.isInteger() && !IntVT.isVector()) {
+        APInt SignMask;
+        if (N0.getValueType().isVector()) {
+          // For a vector, get a mask such as 0x80... per scalar element
+          // and splat it.
+          SignMask = APInt::getSignBit(N0.getScalarValueSizeInBits());
+          SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
+        } else {
+          // For a scalar, just generate 0x80...
+          SignMask = APInt::getSignBit(IntVT.getSizeInBits());
+        }
+        SDLoc DL0(N0);
+        Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
+                          DAG.getConstant(SignMask, DL0, IntVT));
+        AddToWorklist(Int.getNode());
+        return DAG.getBitcast(VT, Int);
       }
-      SDLoc DL0(N0);
-      Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
-                        DAG.getConstant(SignMask, DL0, IntVT));
-      AddToWorklist(Int.getNode());
-      return DAG.getBitcast(VT, Int);
     }
   }
 
Index: test/CodeGen/AMDGPU/fneg-combines.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fneg-combines.ll
@@ -0,0 +1,179 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+
+; GCN-LABEL: {{^}}v_fneg_add_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
+define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %add = fadd float %a, %b
+  %fneg = fsub float -0.000000e+00, %add
+  store float %fneg, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %add = fadd float %a, %b
+  %fneg = fsub float -0.000000e+00, %add
+  store volatile float %fneg, float addrspace(1)* %out
+  store volatile float %add, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %add = fadd float %a, %b
+  %fneg = fsub float -0.000000e+00, %add
+  %use1 = fmul float %add, 4.0
+  store volatile float %fneg, float addrspace(1)* %out
+  store volatile float %use1, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fneg.a = fsub float -0.000000e+00, %a
+  %add = fadd float %fneg.a, %b
+  %fneg = fsub float -0.000000e+00, %add
+  store volatile float %fneg, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fneg.b = fsub float -0.000000e+00, %b
+  %add = fadd float %a, %fneg.b
+  %fneg = fsub float -0.000000e+00, %add
+  store volatile float %fneg, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fneg.a = fsub float -0.000000e+00, %a
+  %fneg.b = fsub float -0.000000e+00, %b
+  %add = fadd float %fneg.a, %fneg.b
+  %fneg = fsub float -0.000000e+00, %add
+  store volatile float %fneg, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_A]]
+define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fneg.a = fsub float -0.000000e+00, %a
+  %add = fadd float %fneg.a, %b
+  %fneg = fsub float -0.000000e+00, %add
+  store volatile float %fneg, float addrspace(1)* %out
+  store volatile float %fneg.a, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fneg.a = fsub float -0.000000e+00, %a
+  %add = fadd float %fneg.a, %b
+  %fneg = fsub float -0.000000e+00, %add
+  %use1 = fmul float %fneg.a, %c
+  store volatile float %fneg, float addrspace(1)* %out
+  store volatile float %use1, float addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }