Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -13397,6 +13397,26 @@
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
 
+  // If we're negating an FMA node, then we can adjust the
+  // instruction to include the extra negation.
+  if (!IsFABS && Op.getOperand(0).hasOneUse()) {
+    SDValue Op0 = Op.getOperand(0);
+    switch (Op0.getOpcode()) {
+    case X86ISD::FMADD:
+      return DAG.getNode(X86ISD::FNMSUB, dl, VT, Op0.getOperand(0),
+                         Op0.getOperand(1), Op0.getOperand(2));
+    case X86ISD::FMSUB:
+      return DAG.getNode(X86ISD::FNMADD, dl, VT, Op0.getOperand(0),
+                         Op0.getOperand(1), Op0.getOperand(2));
+    case X86ISD::FNMADD:
+      return DAG.getNode(X86ISD::FMSUB, dl, VT, Op0.getOperand(0),
+                         Op0.getOperand(1), Op0.getOperand(2));
+    case X86ISD::FNMSUB:
+      return DAG.getNode(X86ISD::FMADD, dl, VT, Op0.getOperand(0),
+                         Op0.getOperand(1), Op0.getOperand(2));
+    }
+  }
+
   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
   // decide if we should generate a 16-byte constant mask when we only need 4 or
   // 8 bytes for the scalar case.
Index: test/CodeGen/X86/fma_patterns.ll
===================================================================
--- test/CodeGen/X86/fma_patterns.ll
+++ test/CodeGen/X86/fma_patterns.ll
@@ -568,6 +568,74 @@
   ret <4 x double> %r
 }
 
+; (fneg (fma x, y, z)) -> (fma x, -y, -z)
+
+define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK_FMA-LABEL: test_v4f32_fneg_fmadd:
+; CHECK_FMA: # BB#0:
+; CHECK_FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK_FMA-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_fneg_fmadd:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %mul = fmul <4 x float> %a0, %a1
+  %add = fadd <4 x float> %mul, %a2
+  %neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
+  ret <4 x float> %neg
+}
+
+define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK_FMA-LABEL: test_v4f64_fneg_fmsub:
+; CHECK_FMA: # BB#0:
+; CHECK_FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK_FMA-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f64_fneg_fmsub:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4-NEXT: retq
+  %mul = fmul <4 x double> %a0, %a1
+  %sub = fsub <4 x double> %mul, %a2
+  %neg = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
+  ret <4 x double> %neg
+}
+
+define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK_FMA-LABEL: test_v4f32_fneg_fnmadd:
+; CHECK_FMA: # BB#0:
+; CHECK_FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK_FMA-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f32_fneg_fnmadd:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4-NEXT: retq
+  %mul = fmul <4 x float> %a0, %a1
+  %neg0 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul
+  %add = fadd <4 x float> %neg0, %a2
+  %neg1 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
+  ret <4 x float> %neg1
+}
+
+define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK_FMA-LABEL: test_v4f64_fneg_fnmsub:
+; CHECK_FMA: # BB#0:
+; CHECK_FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK_FMA-NEXT: retq
+;
+; CHECK_FMA4-LABEL: test_v4f64_fneg_fnmsub:
+; CHECK_FMA4: # BB#0:
+; CHECK_FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4-NEXT: retq
+  %mul = fmul <4 x double> %a0, %a1
+  %neg0 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %mul
+  %sub = fsub <4 x double> %neg0, %a2
+  %neg1 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
+  ret <4 x double> %neg1
+}
+
 ; (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
 
 define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
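
Note (illustrative, not part of the patch): the combine is a pure sign rewrite, so it is correct even without fast-math flags; IEEE-754 negation is exact and fma rounding is symmetric under negation. A minimal standalone C++ sketch of the four identities the switch relies on, using std::fma (the input values here are arbitrary nonzero picks):

#include <cassert>
#include <cmath>

int main() {
  const double a = 1.5, b = -2.25, c = 0.75;
  const double fmadd  = std::fma(a, b, c);   //   a*b + c
  const double fmsub  = std::fma(a, b, -c);  //   a*b - c
  const double fnmadd = std::fma(-a, b, c);  // -(a*b) + c
  const double fnmsub = std::fma(-a, b, -c); // -(a*b) - c
  assert(-fmadd == fnmsub);   // -(a*b + c)    == -(a*b) - c
  assert(-fmsub == fnmadd);   // -(a*b - c)    == -(a*b) + c
  assert(-fnmadd == fmsub);   // -(-(a*b) + c) ==   a*b - c
  assert(-fnmsub == fmadd);   // -(-(a*b) - c) ==   a*b + c
  return 0;
}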