Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -13678,12 +13678,24 @@
                                  In, DAG.getUNDEF(SVT)));
 }
 
-// The only differences between FABS and FNEG are the mask and the logic op.
+/// The only differences between FABS and FNEG are the mask and the logic op.
+/// FNEG also has a folding opportunity for FNEG(FABS(x)).
 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
          "Wrong opcode for lowering FABS or FNEG.");
 
   bool IsFABS = (Op.getOpcode() == ISD::FABS);
+
+  // If this is a FABS and it has an FNEG user, bail out to fold the combination
+  // into an FNABS. We'll lower the FABS after that if it is still in use.
+  if (IsFABS)
+    for (SDNode *User : Op->uses())
+      if (User->getOpcode() == ISD::FNEG)
+        return Op;
+
+  SDValue Op0 = Op.getOperand(0);
+  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
   // Assume scalar op for initialization; update for vector if needed.
@@ -13719,15 +13731,19 @@
     // For a vector, cast operands to a vector type, perform the logic op,
     // and cast the result back to the original value type.
     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-    SDValue Op0Casted = DAG.getNode(ISD::BITCAST, dl, VecVT, Op.getOperand(0));
     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
-    unsigned LogicOp = IsFABS ? ISD::AND : ISD::XOR;
+    SDValue Operand = IsFNABS ?
+      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
+      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
+    unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
     return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getNode(LogicOp, dl, VecVT, Op0Casted, MaskCasted));
+                       DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
   }
+
   // If not vector, then scalar.
-  unsigned LogicOp = IsFABS ? X86ISD::FAND : X86ISD::FXOR;
-  return DAG.getNode(LogicOp, dl, VT, Op.getOperand(0), Mask);
+  unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
+  return DAG.getNode(BitOp, dl, VT, Operand, Mask);
 }
 
 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
Index: llvm/trunk/test/CodeGen/X86/fnabs.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fnabs.ll
+++ llvm/trunk/test/CodeGen/X86/fnabs.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx| FileCheck %s
+
+; Verify that we generate a single OR instruction for a scalar, vec128, and vec256
+; FNABS(x) operation -> FNEG (FABS(x)).
+; If the FABS() result isn't used, the AND instruction should be eliminated.
+; PR20578: http://llvm.org/bugs/show_bug.cgi?id=20578
+
+define float @scalar_no_abs(float %a) {
+; CHECK-LABEL: scalar_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+  %fabs = tail call float @fabsf(float %a) #1
+  %fsub = fsub float -0.0, %fabs
+  ret float %fsub
+}
+
+define float @scalar_uses_abs(float %a) {
+; CHECK-LABEL: scalar_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulss
+; CHECK-NEXT: retq
+  %fabs = tail call float @fabsf(float %a) #1
+  %fsub = fsub float -0.0, %fabs
+  %fmul = fmul float %fsub, %fabs
+  ret float %fmul
+}
+
+define <4 x float> @vector128_no_abs(<4 x float> %a) {
+; CHECK-LABEL: vector128_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+  %fabs = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #1
+  %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  ret <4 x float> %fsub
+}
+
+define <4 x float> @vector128_uses_abs(<4 x float> %a) {
+; CHECK-LABEL: vector128_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulps
+; CHECK-NEXT: retq
+  %fabs = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #1
+  %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  %fmul = fmul <4 x float> %fsub, %fabs
+  ret <4 x float> %fmul
+}
+
+define <8 x float> @vector256_no_abs(<8 x float> %a) {
+; CHECK-LABEL: vector256_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+  %fabs = tail call <8 x float> @llvm.fabs.v8f32(<8 x float> %a) #1
+  %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  ret <8 x float> %fsub
+}
+
+define <8 x float> @vector256_uses_abs(<8 x float> %a) {
+; CHECK-LABEL: vector256_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulps
+; CHECK-NEXT: retq
+  %fabs = tail call <8 x float> @llvm.fabs.v8f32(<8 x float> %a) #1
+  %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  %fmul = fmul <8 x float> %fsub, %fabs
+  ret <8 x float> %fmul
+}
+
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+
+declare float @fabsf(float)
+
+attributes #1 = { readnone }
+