diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46999,6 +46999,8 @@
   EVT VT = Op.getValueType();
   EVT SVT = VT.getScalarType();
   unsigned Opc = Op.getOpcode();
+  SDNodeFlags Flags = Op.getNode()->getFlags();
+  const TargetOptions &Options = getTargetMachine().Options;
   switch (Opc) {
   case ISD::FMA:
   case X86ISD::FMSUB:
@@ -47013,6 +47015,11 @@
         !isOperationLegal(ISD::FMA, VT))
       break;
 
+    // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
+    // if it may have signed zeros.
+    if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
+      break;
+
     // This is always negatible for free but we might be able to remove some
     // extra operand negations as well.
     SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
diff --git a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
--- a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
+++ b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fma | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fma --enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma --enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=X64
 
 ; This test checks combinations of FNEG and FMA intrinsics
 
diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine.ll b/llvm/test/CodeGen/X86/fma-fneg-combine.ll
--- a/llvm/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/llvm/test/CodeGen/X86/fma-fneg-combine.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq --enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+fma --enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
 
 ; This test checks combinations of FNEG and FMA intrinsics on AVX-512 target
 ; PR28892
diff --git a/llvm/test/CodeGen/X86/fma-signed-zero.ll b/llvm/test/CodeGen/X86/fma-signed-zero.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fma-signed-zero.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma \
+; RUN:   | FileCheck %s --check-prefixes=NO-NSZ-OPTION
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma \
+; RUN:   --enable-no-signed-zeros-fp-math | FileCheck %s --check-prefixes=NSZ-OPTION
+
+; This test checks that (fneg (fma (fneg x), y, (fneg z))) can't be folded to (fma x, y, z)
+; without the no-signed-zeros flag (nsz) or the NoSignedZerosFPMath option.
+
+declare float @llvm.fma.f32(float, float, float)
+
+define float @fneg_fma32(float %x, float %y, float %z) {
+; NO-NSZ-OPTION-LABEL: fneg_fma32:
+; NO-NSZ-OPTION:       # %bb.0:
+; NO-NSZ-OPTION-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; NO-NSZ-OPTION-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
+; NO-NSZ-OPTION-NEXT:    retq
+;
+; NSZ-OPTION-LABEL: fneg_fma32:
+; NSZ-OPTION:       # %bb.0:
+; NSZ-OPTION-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; NSZ-OPTION-NEXT:    retq
+  %negx = fneg float %x
+  %negz = fneg float %z
+  %fma = call float @llvm.fma.f32(float %negx, float %y, float %negz)
+  %n = fneg float %fma
+  ret float %n
+}
+
+define float @fneg_fma32_nsz(float %x, float %y, float %z) {
+; NO-NSZ-OPTION-LABEL: fneg_fma32_nsz:
+; NO-NSZ-OPTION:       # %bb.0:
+; NO-NSZ-OPTION-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; NO-NSZ-OPTION-NEXT:    retq
+;
+; NSZ-OPTION-LABEL: fneg_fma32_nsz:
+; NSZ-OPTION:       # %bb.0:
+; NSZ-OPTION-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; NSZ-OPTION-NEXT:    retq
+  %negx = fneg float %x
+  %negz = fneg float %z
+  %fma = call nsz float @llvm.fma.f32(float %negx, float %y, float %negz)
+  %n = fneg float %fma
+  ret float %n
+}
+
+declare double @llvm.fma.f64(double, double, double)
+
+define double @fneg_fma64(double %x, double %y, double %z) {
+; NO-NSZ-OPTION-LABEL: fneg_fma64:
+; NO-NSZ-OPTION:       # %bb.0:
+; NO-NSZ-OPTION-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
+; NO-NSZ-OPTION-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; NO-NSZ-OPTION-NEXT:    retq
+;
+; NSZ-OPTION-LABEL: fneg_fma64:
+; NSZ-OPTION:       # %bb.0:
+; NSZ-OPTION-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; NSZ-OPTION-NEXT:    retq
+  %negx = fneg double %x
+  %negz = fneg double %z
+  %fma = call double @llvm.fma.f64(double %negx, double %y, double %negz)
+  %n = fneg double %fma
+  ret double %n
+}
+
+define double @fneg_fma64_nsz(double %x, double %y, double %z) {
+; NO-NSZ-OPTION-LABEL: fneg_fma64_nsz:
+; NO-NSZ-OPTION:       # %bb.0:
+; NO-NSZ-OPTION-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; NO-NSZ-OPTION-NEXT:    retq
+;
+; NSZ-OPTION-LABEL: fneg_fma64_nsz:
+; NSZ-OPTION:       # %bb.0:
+; NSZ-OPTION-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
+; NSZ-OPTION-NEXT:    retq
+  %negx = fneg double %x
+  %negz = fneg double %z
+  %fma = call nsz double @llvm.fma.f64(double %negx, double %y, double %negz)
+  %n = fneg double %fma
+  ret double %n
+}
diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll
--- a/llvm/test/CodeGen/X86/fma_patterns.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns.ll
@@ -1308,10 +1308,10 @@
 ; AVX512-NOINFS-NEXT:    vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
 ; AVX512-NOINFS-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
 ; AVX512-NOINFS-NEXT:    retq
-  %t1 = fsub float 1.0, %t
-  %tx = fmul float %x, %t
-  %ty = fmul float %y, %t1
-  %r = fadd float %tx, %ty
+  %t1 = fsub nsz float 1.0, %t
+  %tx = fmul nsz float %x, %t
+  %ty = fmul nsz float %y, %t1
+  %r = fadd nsz float %tx, %ty
   ret float %r
 }
 
@@ -1357,10 +1357,10 @@
 ; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
 ; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
 ; AVX512-NOINFS-NEXT:    retq
-  %t1 = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %t
-  %tx = fmul <4 x float> %x, %t
-  %ty = fmul <4 x float> %y, %t1
-  %r = fadd <4 x float> %tx, %ty
+  %t1 = fsub nsz <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %t
+  %tx = fmul nsz <4 x float> %x, %t
+  %ty = fmul nsz <4 x float> %y, %t1
+  %r = fadd nsz <4 x float> %tx, %ty
   ret <4 x float> %r
 }
 
@@ -1406,10 +1406,10 @@
 ; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1
 ; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1
 ; AVX512-NOINFS-NEXT:    retq
-  %t1 = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
-  %tx = fmul <8 x float> %x, %t
-  %ty = fmul <8 x float> %y, %t1
-  %r = fadd <8 x float> %tx, %ty
+  %t1 = fsub nsz <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
+  %tx = fmul nsz <8 x float> %x, %t
+  %ty = fmul nsz <8 x float> %y, %t1
+  %r = fadd nsz <8 x float> %tx, %ty
   ret <8 x float> %r
 }
 
@@ -1455,10 +1455,10 @@
 ; AVX512-NOINFS-NEXT:    vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
 ; AVX512-NOINFS-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
 ; AVX512-NOINFS-NEXT:    retq
-  %t1 = fsub double 1.0, %t
-  %tx = fmul double %x, %t
-  %ty = fmul double %y, %t1
-  %r = fadd double %tx, %ty
+  %t1 = fsub nsz double 1.0, %t
+  %tx = fmul nsz double %x, %t
+  %ty = fmul nsz double %y, %t1
+  %r = fadd nsz double %tx, %ty
   ret double %r
 }
 
@@ -1504,10 +1504,10 @@
 ; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
 ; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
 ; AVX512-NOINFS-NEXT:    retq
-  %t1 = fsub <2 x double> <double 1.0, double 1.0>, %t
-  %tx = fmul <2 x double> %x, %t
-  %ty = fmul <2 x double> %y, %t1
-  %r = fadd <2 x double> %tx, %ty
+  %t1 = fsub nsz <2 x double> <double 1.0, double 1.0>, %t
+  %tx = fmul nsz <2 x double> %x, %t
+  %ty = fmul nsz <2 x double> %y, %t1
+  %r = fadd nsz <2 x double> %tx, %ty
   ret <2 x double> %r
 }
 
@@ -1553,10 +1553,10 @@
 ; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1
 ; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1
 ; AVX512-NOINFS-NEXT:    retq
-  %t1 = fsub <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t
-  %tx = fmul <4 x double> %x, %t
-  %ty = fmul <4 x double> %y, %t1
-  %r = fadd <4 x double> %tx, %ty
+  %t1 = fsub nsz <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t
+  %tx = fmul nsz <4 x double> %x, %t
+  %ty = fmul nsz <4 x double> %y, %t1
+  %r = fadd nsz <4 x double> %tx, %ty
   ret <4 x double> %r
 }
 
@@ -1579,9 +1579,9 @@
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
 ; AVX512-NEXT:    retq
-  %mul = fmul <4 x float> %a0, %a1
-  %add = fadd <4 x float> %mul, %a2
-  %neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
+  %mul = fmul nsz <4 x float> %a0, %a1
+  %add = fadd nsz <4 x float> %mul, %a2
+  %neg = fsub nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
   ret <4 x float> %neg
 }
 
@@ -1600,9 +1600,9 @@
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
 ; AVX512-NEXT:    retq
-  %mul = fmul <4 x double> %a0, %a1
-  %sub = fsub <4 x double> %mul, %a2
-  %neg = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
+  %mul = fmul nsz <4 x double> %a0, %a1
+  %sub = fsub nsz <4 x double> %mul, %a2
+  %neg = fsub nsz <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
   ret <4 x double> %neg
 }
 
@@ -1621,10 +1621,10 @@
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
 ; AVX512-NEXT:    retq
-  %mul = fmul <4 x float> %a0, %a1
-  %neg0 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul
-  %add = fadd <4 x float> %neg0, %a2
-  %neg1 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
+  %mul = fmul nsz <4 x float> %a0, %a1
+  %neg0 = fsub nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul
+  %add = fadd nsz <4 x float> %neg0, %a2
+  %neg1 = fsub nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
   ret <4 x float> %neg1
 }
 
@@ -1643,10 +1643,10 @@
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
 ; AVX512-NEXT:    retq
-  %mul = fmul <4 x double> %a0, %a1
-  %neg0 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %mul
-  %sub = fsub <4 x double> %neg0, %a2
-  %neg1 = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
+  %mul = fmul nsz <4 x double> %a0, %a1
+  %neg0 = fsub nsz <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %mul
+  %sub = fsub nsz <4 x double> %neg0, %a2
+  %neg1 = fsub nsz <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
   ret <4 x double> %neg1
 }
 
diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
--- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
@@ -868,10 +868,10 @@
 ; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1
 ; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1
 ; AVX512-NOINFS-NEXT:    retq
-  %t1 = fsub <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
-  %tx = fmul <16 x float> %x, %t
-  %ty = fmul <16 x float> %y, %t1
-  %r = fadd <16 x float> %tx, %ty
+  %t1 = fsub nsz <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
+  %tx = fmul nsz <16 x float> %x, %t
+  %ty = fmul nsz <16 x float> %y, %t1
+  %r = fadd nsz <16 x float> %tx, %ty
   ret <16 x float> %r
 }
 
@@ -927,10 +927,10 @@
 ; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1
 ; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1
 ; AVX512-NOINFS-NEXT:    retq
-  %t1 = fsub <8 x double> <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>, %t
-  %tx = fmul <8 x double> %x, %t
-  %ty = fmul <8 x double> %y, %t1
-  %r = fadd <8 x double> %tx, %ty
+  %t1 = fsub nsz <8 x double> <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>, %t
+  %tx = fmul nsz <8 x double> %x, %t
+  %ty = fmul nsz <8 x double> %y, %t1
+  %r = fadd nsz <8 x double> %tx, %ty
   ret <8 x double> %r
 }
 
@@ -955,9 +955,9 @@
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
 ; AVX512-NEXT:    retq
-  %mul = fmul <16 x float> %a0, %a1
-  %add = fadd <16 x float> %mul, %a2
-  %neg = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %add
+  %mul = fmul nsz <16 x float> %a0, %a1
+  %add = fadd nsz <16 x float> %mul, %a2
+  %neg = fsub nsz <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %add
   ret <16 x float> %neg
 }
 
@@ -978,9 +978,9 @@
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
 ; AVX512-NEXT:    retq
-  %mul = fmul <8 x double> %a0, %a1
-  %sub = fsub <8 x double> %mul, %a2
-  %neg = fsub <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %sub
+  %mul = fmul nsz <8 x double> %a0, %a1
+  %sub = fsub nsz <8 x double> %mul, %a2
+  %neg = fsub nsz <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %sub
   ret <8 x double> %neg
 }
 
@@ -1001,10 +1001,10 @@
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
 ; AVX512-NEXT:    retq
-  %mul = fmul <16 x float> %a0, %a1
-  %neg0 = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %mul
-  %add = fadd <16 x float> %neg0, %a2
-  %neg1 = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %add
+  %mul = fmul nsz <16 x float> %a0, %a1
+  %neg0 = fsub nsz <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %mul
+  %add = fadd nsz <16 x float> %neg0, %a2
+  %neg1 = fsub nsz <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %add
   ret <16 x float> %neg1
 }
 
@@ -1025,10 +1025,10 @@
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
 ; AVX512-NEXT:    retq
-  %mul = fmul <8 x double> %a0, %a1
-  %neg0 = fsub <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %mul
-  %sub = fsub <8 x double> %neg0, %a2
-  %neg1 = fsub <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %sub
+  %mul = fmul nsz <8 x double> %a0, %a1
+  %neg0 = fsub nsz <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %mul
+  %sub = fsub nsz <8 x double> %neg0, %a2
+  %neg1 = fsub nsz <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %sub
   ret <8 x double> %neg1
 }
 
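
Illustration (not part of the patch): the fold that the new bailout disables rewrites -((-x)*y + (-z)) into x*y + z, and under correct rounding the two can only disagree in the sign of a zero result. A minimal host-side C++ sketch of that discrepancy, using std::fma as a stand-in for the target FMA node; the program and its variable names are hypothetical, not taken from the patch or tests:

  #include <cmath>
  #include <cstdio>

  int main() {
    float x = 0.0f, y = 1.0f, z = -0.0f;
    // Unfolded form: fneg(fma(fneg(x), y, fneg(z))) = -((-0.0)*1.0 + 0.0) = -0.0
    float unfolded = -std::fma(-x, y, -z);
    // Folded form: fma(x, y, z) = (+0.0)*1.0 + (-0.0) = +0.0
    float folded = std::fma(x, y, z);
    std::printf("unfolded signbit=%d, folded signbit=%d\n",
                std::signbit(unfolded), std::signbit(folded));
    return 0;
  }

For these inputs the unfolded expression yields -0.0 and the folded one yields +0.0; every nonzero result is identical in both forms. That sign-of-zero difference is exactly what the hasNoSignedZeros()/NoSignedZerosFPMath check above guards against, and why the unchanged FMA tests now need either the nsz flag or --enable-no-signed-zeros-fp-math to keep their folded CHECK lines.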