diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1110,6 +1110,9 @@
     /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
     MVT hasFastEqualityCompare(unsigned NumBits) const override;
 
+    /// Force aggressive FMA fusion.
+    bool enableAggressiveFMAFusion(EVT VT) const override;
+
     /// Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5855,6 +5855,10 @@
   return MVT::INVALID_SIMPLE_VALUE_TYPE;
 }
 
+bool X86TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  return Subtarget.hasAnyFMA();
+}
+
 /// Val is the undef sentinel value or equal to the specified value.
 static bool isUndefOrEqual(int Val, int CmpVal) {
   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
diff --git a/llvm/test/CodeGen/X86/dag-fmf-cse.ll b/llvm/test/CodeGen/X86/dag-fmf-cse.ll
--- a/llvm/test/CodeGen/X86/dag-fmf-cse.ll
+++ b/llvm/test/CodeGen/X86/dag-fmf-cse.ll
@@ -9,8 +9,8 @@
 define float @fmf_should_not_break_cse(float %a, float %b) {
 ; CHECK-LABEL: fmf_should_not_break_cse:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
 ; CHECK-NEXT:    retq
   %mul1 = fmul fast float %a, %b
   %nega = fsub fast float 0.0, %a
@@ -22,8 +22,8 @@
 define <4 x float> @fmf_should_not_break_cse_vector(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: fmf_should_not_break_cse_vector:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; CHECK-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
 ; CHECK-NEXT:    retq
   %mul1 = fmul fast <4 x float> %a, %b
   %nega = fsub fast <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, %a
diff --git a/llvm/test/CodeGen/X86/fmsubadd-combine.ll b/llvm/test/CodeGen/X86/fmsubadd-combine.ll
--- a/llvm/test/CodeGen/X86/fmsubadd-combine.ll
+++ b/llvm/test/CodeGen/X86/fmsubadd-combine.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=CHECK,NOFMA
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=CHECK,FMA3,FMA3_256
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=CHECK,FMA3,FMA3_512
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=CHECK,FMA4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=NOFMA
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=FMA3,FMA3_256
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=FMA3,FMA3_512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=FMA4
 
 ; This test checks the fusing of MUL + SUB/ADD to FMSUBADD.
@@ -186,13 +186,28 @@
 ; This should not be matched to fmsubadd because the mul is on the wrong side of the fsub.
 define <2 x double> @mul_subadd_bad_commute(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
-; CHECK-LABEL: mul_subadd_bad_commute:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vsubpd %xmm0, %xmm2, %xmm1
-; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; CHECK-NEXT:    retq
+; NOFMA-LABEL: mul_subadd_bad_commute:
+; NOFMA:       # %bb.0: # %entry
+; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
+; NOFMA-NEXT:    vsubpd %xmm0, %xmm2, %xmm1
+; NOFMA-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; NOFMA-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; NOFMA-NEXT:    retq
+;
+; FMA3-LABEL: mul_subadd_bad_commute:
+; FMA3:       # %bb.0: # %entry
+; FMA3-NEXT:    vmovapd %xmm1, %xmm3
+; FMA3-NEXT:    vfnmadd213pd {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
+; FMA3-NEXT:    vfmadd213pd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm2
+; FMA3-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm3[1]
+; FMA3-NEXT:    retq
+;
+; FMA4-LABEL: mul_subadd_bad_commute:
+; FMA4:       # %bb.0: # %entry
+; FMA4-NEXT:    vfnmaddpd {{.*#+}} xmm3 = -(xmm0 * xmm1) + xmm2
+; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
+; FMA4-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm3[1]
+; FMA4-NEXT:    retq
 entry:
   %AB = fmul <2 x double> %A, %B
   %Sub = fsub <2 x double> %C, %AB