diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1240,8 +1240,10 @@ if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, - MVT::v2f64, MVT::v4f64 }) + MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::STRICT_FMA, VT, Legal); + } } for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { @@ -1434,6 +1436,7 @@ setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -6475,7 +6475,7 @@ VEX_W; } -defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>; +defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd, X86FmaddRnd>; defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>; defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>; defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>; @@ -6553,7 +6553,7 @@ VEX_W; } -defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>; +defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd, X86FmaddRnd>; defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>; defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>; defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>; @@ -6633,7 +6633,7 @@ VEX_W; } -defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>; +defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd, X86FmaddRnd>; defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>; defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>; defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>; @@ -6730,7 +6730,7 @@ } } -defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>; +defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86any_Fmadd, X86FmaddRnd>; defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>; defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; @@ -6937,7 +6937,7 @@ } } -defm : avx512_scalar_fma_patterns<X86Fmadd, ...>; +defm : avx512_scalar_fma_patterns<X86any_Fmadd, ...>; defm : avx512_scalar_fma_patterns<...>; @@ -6946,7 +6946,7 @@ defm : avx512_scalar_fma_patterns<...>; -defm : avx512_scalar_fma_patterns<X86Fmadd, ...>; +defm : avx512_scalar_fma_patterns<X86any_Fmadd, ...>; defm : avx512_scalar_fma_patterns<...>; diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td --- a/llvm/lib/Target/X86/X86InstrFMA.td +++ b/llvm/lib/Target/X86/X86InstrFMA.td @@ -123,7 +123,7 @@ // Fused Multiply-Add let ExeDomain = SSEPackedSingle in { defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS", - loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32, + loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32, SchedWriteFMA>; defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS", loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32, @@ -138,7 +138,7 @@ let ExeDomain
= SSEPackedDouble in { defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD", - loadv2f64, loadv4f64, X86Fmadd, v2f64, + loadv2f64, loadv4f64, X86any_Fmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W; defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD", loadv2f64, loadv4f64, X86Fmsub, v2f64, @@ -319,7 +319,7 @@ VR128, sdmem, sched>, VEX_W; } -defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadd, +defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd, SchedWriteFMA.Scl>, VEX_LIG; defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub, SchedWriteFMA.Scl>, VEX_LIG; @@ -372,12 +372,12 @@ } } -defm : scalar_fma_patterns<X86Fmadd, ...>; +defm : scalar_fma_patterns<X86any_Fmadd, ...>; defm : scalar_fma_patterns<...>; defm : scalar_fma_patterns<...>; defm : scalar_fma_patterns<...>; -defm : scalar_fma_patterns<X86Fmadd, ...>; +defm : scalar_fma_patterns<X86any_Fmadd, ...>; defm : scalar_fma_patterns<...>; defm : scalar_fma_patterns<...>; defm : scalar_fma_patterns<...>; @@ -538,7 +538,7 @@ let ExeDomain = SSEPackedSingle in { // Scalar Instructions - defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32, + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86any_Fmadd, loadf32, SchedWriteFMA.Scl>, fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, SchedWriteFMA.Scl>; @@ -555,7 +555,7 @@ fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, SchedWriteFMA.Scl>; // Packed Instructions - defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, + defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86any_Fmadd, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; @@ -571,7 +571,7 @@ let ExeDomain = SSEPackedDouble in { // Scalar Instructions - defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64, + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86any_Fmadd, loadf64, SchedWriteFMA.Scl>, fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, SchedWriteFMA.Scl>; @@ -588,7 +588,7 @@ fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, SchedWriteFMA.Scl>; // Packed Instructions - defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; @@ -629,12 +629,12 @@ } } -defm : scalar_fma4_patterns<X86Fmadd, ...>; +defm : scalar_fma4_patterns<X86any_Fmadd, ...>; defm : scalar_fma4_patterns<...>; defm : scalar_fma4_patterns<...>; defm : scalar_fma4_patterns<...>; -defm : scalar_fma4_patterns<X86Fmadd, ...>; +defm : scalar_fma4_patterns<X86any_Fmadd, ...>; defm : scalar_fma4_patterns<...>; defm : scalar_fma4_patterns<...>; defm : scalar_fma4_patterns<...>; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -493,7 +493,11 @@ def X86fgetexps : SDNode<"X86ISD::FGETEXPS", SDTFPBinOp>; def X86fgetexpSAEs : SDNode<"X86ISD::FGETEXPS_SAE", SDTFPBinOp>; -def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>; +def X86strict_Fmadd : SDNode<"ISD::STRICT_FMA", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>; +def X86any_Fmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3), + [(X86strict_Fmadd node:$src1, node:$src2, node:$src3), + (X86Fmadd node:$src1, node:$src2, node:$src3)]>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>; def X86Fmsub : SDNode<"X86ISD::FMSUB",
SDTFPTernaryOp, [SDNPCommutative]>; def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>; diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar.ll b/llvm/test/CodeGen/X86/fp-strict-scalar.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,SSE,SSE-X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,SSE,SSE-X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX,AVX-X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX,AVX-X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX,AVX-X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX,AVX-X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,X87 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=SSE-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=SSE-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=X87 declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) @@ -19,6 +19,8 @@ declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata) declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) +declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) define double @fadd_f64(double %a, double %b) nounwind strictfp { ; SSE-X86-LABEL: fadd_f64: @@ -579,4 +581,118 @@ ret void } +define double @fma_f64(double %a, double %b, double %c) nounwind strictfp { +; SSE-X86-LABEL: fma_f64: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: subl $24, %esp +; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-X86-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-X86-NEXT: movsd %xmm2, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: 
movsd %xmm1, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movsd %xmm0, (%esp) +; SSE-X86-NEXT: calll fma +; SSE-X86-NEXT: addl $24, %esp +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fma_f64: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: pushq %rax +; SSE-X64-NEXT: callq fma +; SSE-X64-NEXT: popq %rax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fma_f64: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: pushl %ebp +; AVX-X86-NEXT: movl %esp, %ebp +; AVX-X86-NEXT: andl $-8, %esp +; AVX-X86-NEXT: subl $8, %esp +; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-X86-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + mem +; AVX-X86-NEXT: vmovsd %xmm1, (%esp) +; AVX-X86-NEXT: fldl (%esp) +; AVX-X86-NEXT: movl %ebp, %esp +; AVX-X86-NEXT: popl %ebp +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fma_f64: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; AVX-X64-NEXT: retq +; +; X87-LABEL: fma_f64: +; X87: # %bb.0: +; X87-NEXT: subl $24, %esp +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: fstpl {{[0-9]+}}(%esp) +; X87-NEXT: fstpl {{[0-9]+}}(%esp) +; X87-NEXT: fstpl (%esp) +; X87-NEXT: calll fma +; X87-NEXT: addl $24, %esp +; X87-NEXT: retl + %res = call double @llvm.experimental.constrained.fma.f64(double %a, double %b, double %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %res +} + +define float @fma_f32(float %a, float %b, float %c) nounwind strictfp { +; SSE-X86-LABEL: fma_f32: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: subl $12, %esp +; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-X86-NEXT: movss %xmm2, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movss %xmm0, (%esp) +; SSE-X86-NEXT: calll fmaf +; SSE-X86-NEXT: addl $12, %esp +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fma_f32: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: pushq %rax +; SSE-X64-NEXT: callq fmaf +; SSE-X64-NEXT: popq %rax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fma_f32: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: pushl %eax +; AVX-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-X86-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem +; AVX-X86-NEXT: vmovss %xmm1, (%esp) +; AVX-X86-NEXT: flds (%esp) +; AVX-X86-NEXT: popl %eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fma_f32: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; AVX-X64-NEXT: retq +; +; X87-LABEL: fma_f32: +; X87: # %bb.0: +; X87-NEXT: subl $12, %esp +; X87-NEXT: flds {{[0-9]+}}(%esp) +; X87-NEXT: flds {{[0-9]+}}(%esp) +; X87-NEXT: flds {{[0-9]+}}(%esp) +; X87-NEXT: fstps {{[0-9]+}}(%esp) +; X87-NEXT: fstps {{[0-9]+}}(%esp) +; X87-NEXT: fstps (%esp) +; X87-NEXT: calll fmaf +; X87-NEXT: addl $12, %esp +; X87-NEXT: retl + %res = call float @llvm.experimental.constrained.fma.f32(float %a, float %b, float %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %res +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-128.ll b/llvm/test/CodeGen/X86/vec-strict-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-128.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s 
-mtriple=i686-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=SSE,SSE-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=SSE,SSE-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata) declare <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float>, <4 x float>, metadata, metadata) @@ -18,6 +18,8 @@ declare <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float>, metadata, metadata) declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) +declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata) define <2 x double> @f1(<2 x double> %a, <2 x double> %b) #0 { ; SSE-LABEL: f1: @@ -217,4 +219,184 @@ ret <2 x double> %res } +define <4 x float> @f13(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +; SSE-X86-LABEL: f13: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: subl $108, %esp +; SSE-X86-NEXT: .cfi_def_cfa_offset 112 +; SSE-X86-NEXT: movups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; SSE-X86-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; SSE-X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; SSE-X86-NEXT: movss %xmm2, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movss %xmm0, (%esp) +; SSE-X86-NEXT: calll fmaf +; SSE-X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; SSE-X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movups 
{{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-X86-NEXT: movss %xmm0, (%esp) +; SSE-X86-NEXT: calll fmaf +; SSE-X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; SSE-X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-X86-NEXT: movss %xmm0, (%esp) +; SSE-X86-NEXT: calll fmaf +; SSE-X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-X86-NEXT: movss %xmm0, (%esp) +; SSE-X86-NEXT: fstps {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; SSE-X86-NEXT: fstps {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; SSE-X86-NEXT: fstps {{[0-9]+}}(%esp) +; SSE-X86-NEXT: calll fmaf +; SSE-X86-NEXT: fstps {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-X86-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-X86-NEXT: addl $108, %esp +; SSE-X86-NEXT: .cfi_def_cfa_offset 4 +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: f13: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: subq $88, %rsp +; SSE-X64-NEXT: .cfi_def_cfa_offset 96 +; SSE-X64-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-X64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] +; SSE-X64-NEXT: callq fmaf +; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-X64-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-X64-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE-X64-NEXT: callq fmaf +; SSE-X64-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-X64-NEXT: 
callq fmaf +; SSE-X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE-X64-NEXT: callq fmaf +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-X64-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-X64-NEXT: # xmm1 = xmm1[0],mem[0] +; SSE-X64-NEXT: movaps %xmm1, %xmm0 +; SSE-X64-NEXT: addq $88, %rsp +; SSE-X64-NEXT: .cfi_def_cfa_offset 8 +; SSE-X64-NEXT: retq +; +; AVX-LABEL: f13: +; AVX: # %bb.0: +; AVX-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; AVX-NEXT: ret{{[l|q]}} + %res = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %res +} + +define <2 x double> @f14(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 { +; SSE-X86-LABEL: f14: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: pushl %ebp +; SSE-X86-NEXT: .cfi_def_cfa_offset 8 +; SSE-X86-NEXT: .cfi_offset %ebp, -8 +; SSE-X86-NEXT: movl %esp, %ebp +; SSE-X86-NEXT: .cfi_def_cfa_register %ebp +; SSE-X86-NEXT: andl $-16, %esp +; SSE-X86-NEXT: subl $112, %esp +; SSE-X86-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; SSE-X86-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; SSE-X86-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; SSE-X86-NEXT: movlps %xmm2, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movlps %xmm1, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movlps %xmm0, (%esp) +; SSE-X86-NEXT: calll fma +; SSE-X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: movhps %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: movhps %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-X86-NEXT: movhps %xmm0, (%esp) +; SSE-X86-NEXT: fstpl {{[0-9]+}}(%esp) +; SSE-X86-NEXT: calll fma +; SSE-X86-NEXT: fstpl {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-X86-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; SSE-X86-NEXT: movl %ebp, %esp +; SSE-X86-NEXT: popl %ebp +; SSE-X86-NEXT: .cfi_def_cfa %esp, 4 +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: f14: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: subq $72, %rsp +; SSE-X64-NEXT: .cfi_def_cfa_offset 80 +; SSE-X64-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-X64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-X64-NEXT: callq fma +; SSE-X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-X64-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-X64-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-X64-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE-X64-NEXT: callq fma +; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-X64-NEXT: movlhps {{.*#+}} 
xmm1 = xmm1[0],xmm0[0] +; SSE-X64-NEXT: movaps %xmm1, %xmm0 +; SSE-X64-NEXT: addq $72, %rsp +; SSE-X64-NEXT: .cfi_def_cfa_offset 8 +; SSE-X64-NEXT: retq +; +; AVX-LABEL: f14: +; AVX: # %bb.0: +; AVX-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; AVX-NEXT: ret{{[l|q]}} + %res = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %res +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-256.ll b/llvm/test/CodeGen/X86/vec-strict-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-256.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 -disable-strictnode-mutation | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 -disable-strictnode-mutation | FileCheck %s +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma -O3 -disable-strictnode-mutation | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma -O3 -disable-strictnode-mutation | FileCheck %s ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s @@ -16,6 +16,8 @@ declare <8 x float> @llvm.experimental.constrained.sqrt.v8f32(<8 x float>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f32(<4 x float>, metadata) declare <4 x float> @llvm.experimental.constrained.fptrunc.v4f32.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double>, <4 x double>, <4 x double>, metadata, metadata) +declare <8 x float> @llvm.experimental.constrained.fma.v8f32(<8 x float>, <8 x float>, <8 x float>, metadata, metadata) define <4 x double> @f1(<4 x double> %a, <4 x double> %b) #0 { ; CHECK-LABEL: f1: @@ -154,4 +156,26 @@ ret <4 x float> %ret } +define <8 x float> @f13(<8 x float> %a, <8 x float> %b, <8 x float> %c) #0 { +; CHECK-LABEL: f13: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <8 x float> @llvm.experimental.constrained.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x float> %res +} + +define <4 x double> @f14(<4 x double> %a, <4 x double> %b, <4 x double> %c) #0 { +; CHECK-LABEL: f14: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %res +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-512.ll b/llvm/test/CodeGen/X86/vec-strict-512.ll --- a/llvm/test/CodeGen/X86/vec-strict-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-512.ll @@ -14,6 +14,8 @@ declare <16 x float> @llvm.experimental.constrained.sqrt.v16f32(<16 x float>, metadata, metadata) declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f32(<8 x float>, metadata) declare <8 x float> @llvm.experimental.constrained.fptrunc.v8f32.v8f64(<8 x double>, metadata, metadata) +declare <8 
x double> @llvm.experimental.constrained.fma.v8f64(<8 x double>, <8 x double>, <8 x double>, metadata, metadata) +declare <16 x float> @llvm.experimental.constrained.fma.v16f32(<16 x float>, <16 x float>, <16 x float>, metadata, metadata) define <8 x double> @f1(<8 x double> %a, <8 x double> %b) #0 { ; CHECK-LABEL: f1: @@ -151,4 +153,26 @@ ret <8 x float> %ret } +define <16 x float> @f13(<16 x float> %a, <16 x float> %b, <16 x float> %c) #0 { +; CHECK-LABEL: f13: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <16 x float> @llvm.experimental.constrained.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x float> %res +} + +define <8 x double> @f14(<8 x double> %a, <8 x double> %b, <8 x double> %c) #0 { +; CHECK-LABEL: f14: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <8 x double> @llvm.experimental.constrained.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x double> %res +} + attributes #0 = { strictfp }
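
As a quick way to exercise the new lowering outside of the tests above, the standalone sketch below (not part of the patch; the file and the function name strict_fma_smoke are made up for illustration) feeds a single constrained fma through llc. Assuming -mattr=+fma and -disable-strictnode-mutation, it should select vfmadd213ss instead of falling back to a call to fmaf, mirroring the AVX-X64 checks added to fp-strict-scalar.ll.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma -O3 -disable-strictnode-mutation | FileCheck %s

declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)

define float @strict_fma_smoke(float %a, float %b, float %c) #0 {
; CHECK-LABEL: strict_fma_smoke:
; CHECK: vfmadd213ss
  %res = call float @llvm.experimental.constrained.fma.f32(float %a, float %b, float %c,
                                                           metadata !"round.dynamic",
                                                           metadata !"fpexcept.strict") #0
  ret float %res
}

attributes #0 = { strictfp }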