Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -692,7 +692,7 @@
   void visitAdd(const User &I)  { visitBinary(I, ISD::ADD); }
   void visitFAdd(const User &I) { visitBinary(I, ISD::FADD); }
   void visitSub(const User &I)  { visitBinary(I, ISD::SUB); }
-  void visitFSub(const User &I);
+  void visitFSub(const User &I) { visitBinary(I, ISD::FSUB); }
   void visitMul(const User &I)  { visitBinary(I, ISD::MUL); }
   void visitFMul(const User &I) { visitBinary(I, ISD::FMUL); }
   void visitURem(const User &I) { visitBinary(I, ISD::UREM); }
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3025,20 +3025,6 @@
   DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
 }
 
-void SelectionDAGBuilder::visitFSub(const User &I) {
-  // -0.0 - X --> fneg
-  Type *Ty = I.getType();
-  if (isa<Constant>(I.getOperand(0)) &&
-      I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) {
-    SDValue Op2 = getValue(I.getOperand(1));
-    setValue(&I, DAG.getNode(ISD::FNEG, getCurSDLoc(),
-                             Op2.getValueType(), Op2));
-    return;
-  }
-
-  visitBinary(I, ISD::FSUB);
-}
-
 void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) {
   SDNodeFlags Flags;
 
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5707,6 +5707,18 @@
     return Op.getOperand(0);
   }
 
+  // Treat fsub(-0.0,x) the same as fneg.
+  if (Op.getOpcode() == ISD::FSUB) {
+    ConstantFPSDNode *N0C = isConstOrConstSplatFP(Op.getOperand(0), true);
+    if (N0C && N0C->isZero()) {
+      // TODO: Handle NSZ cases.
+      if (N0C->isNegative()) {
+        Cost = NegatibleCost::Cheaper;
+        return Op.getOperand(1);
+      }
+    }
+  }
+
   // Don't recurse exponentially.
   if (Depth > SelectionDAG::MaxRecursionDepth)
     return SDValue();
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9241,7 +9241,6 @@
   switch (Opcode) {
   // These will flush denorms if required.
   case ISD::FADD:
-  case ISD::FSUB:
   case ISD::FMUL:
   case ISD::FCEIL:
   case ISD::FFLOOR:
@@ -9271,6 +9270,16 @@
   case AMDGPUISD::CVT_F32_UBYTE3:
     return true;
 
+  case ISD::FSUB: {
+    // FSUB(-0.0, X) is treated as a negation, which can be lowered or combined
+    // as a bit operation, so its input must be checked recursively.
+    ConstantFPSDNode *N0C = isConstOrConstSplatFP(Op.getOperand(0), true);
+    // TODO: Handle NSZ.
+    if (N0C && N0C->isZero() && N0C->isNegative())
+      return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
+
+    return true;
+  }
   // It can/will be lowered or combined as a bit operation.
   // Need to check their input recursively to handle.
   case ISD::FNEG:
Index: llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -273,11 +273,14 @@
   ret void
 }
 
+; FIXME: These changes look suspicious, but they seem OK. It looks like non-determinism in
+;        DAGCombine is picking a different lowering, but the instructions and counts are the same.
+;        There is, however, a small scheduling difference: a move shifts towards the end.
 ; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
+; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
+; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
+; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
+; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
 ; GCN-DENORM-DAG: v_rcp_f32_e32
 ; GCN-DENORM-DAG: v_rcp_f32_e32
 
@@ -298,8 +301,8 @@
 
 ; GCN-DENORM-DAG: v_div_fmas_f32
 ; GCN-DENORM-DAG: v_div_fmas_f32
-; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
-; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
+; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
+; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
 
 ; GCN-FLUSH-DAG:  v_rcp_f32_e32
 ; GCN-FLUSH-DAG:  v_rcp_f32_e64
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
@@ -30,9 +30,10 @@
   ret void
 }
 
+; NOTE: Seems equivalent, but unverified: the negation moved from the constant (-4.0) to the med3 result operand, and the multiply is now e64 instead of e32.
 ; GCN-LABEL: {{^}}test_fneg_fmed3_multi_use:
 ; GCN: v_med3_f32 [[MED3:v[0-9]+]], -s{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, -4.0, [[MED3]]
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[MED3]], 4.0
 define amdgpu_kernel void @test_fneg_fmed3_multi_use(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
   %med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2)
   %neg.med3 = fsub float -0.0, %med3
Index: llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
+++ llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
@@ -7,11 +7,14 @@
 ; EG-NOT: CND
 ; EG: SET{{[NEQGTL]+}}_DX10
 
+; NOTE: It does not seem possible to make FSUB(-0.0, X) work here: the FSUB is
+;       combined with the SELECT, and that transform cannot be prevented without
+;       affecting other targets. The tests below therefore use fneg instead.
 define amdgpu_kernel void @test_a(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 0.000000e+00
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
-  %2 = fsub float -0.000000e+00, %1
+  %2 = fneg float %1
   %3 = fptosi float %2 to i32
   %4 = bitcast i32 %3 to float
   %5 = bitcast float %4 to i32
@@ -39,7 +42,7 @@
 entry:
   %0 = fcmp olt float %in, 0.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
-  %2 = fsub float -0.000000e+00, %1
+  %2 = fneg float %1
   %3 = fptosi float %2 to i32
   %4 = bitcast i32 %3 to float
   %5 = bitcast float %4 to i32
Index: llvm/test/CodeGen/AMDGPU/set-dx10.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/set-dx10.ll
+++ llvm/test/CodeGen/AMDGPU/set-dx10.ll
@@ -12,7 +12,7 @@
 entry:
   %0 = fcmp une float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
-  %2 = fsub float -0.000000e+00, %1
+  %2 = fneg float %1
   %3 = fptosi float %2 to i32
   store i32 %3, i32 addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@
 entry:
   %0 = fcmp oeq float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
-  %2 = fsub float -0.000000e+00, %1
+  %2 = fneg float %1
   %3 = fptosi float %2 to i32
   store i32 %3, i32 addrspace(1)* %out
   ret void
@@ -64,7 +64,7 @@
 entry:
   %0 = fcmp ogt float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
-  %2 = fsub float -0.000000e+00, %1
+  %2 = fneg float %1
   %3 = fptosi float %2 to i32
   store i32 %3, i32 addrspace(1)* %out
   ret void
@@ -90,7 +90,7 @@
 entry:
   %0 = fcmp oge float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
-  %2 = fsub float -0.000000e+00, %1
+  %2 = fneg float %1
   %3 = fptosi float %2 to i32
   store i32 %3, i32 addrspace(1)* %out
   ret void
@@ -116,7 +116,7 @@
 entry:
   %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
-  %2 = fsub float -0.000000e+00, %1
+  %2 = fneg float %1
   %3 = fptosi float %2 to i32
   store i32 %3, i32 addrspace(1)* %out
   ret void
@@ -142,7 +142,7 @@
 entry:
   %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
-  %2 = fsub float -0.000000e+00, %1
+  %2 = fneg float %1
   %3 = fptosi float %2 to i32
   store i32 %3, i32 addrspace(1)* %out
   ret void