diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -95,7 +95,8 @@
                Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
 }
 
-let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
+let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1,
+    Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                        string OpcodeStr, string PackTy, string Suff,
                        PatFrag MemFrag128, PatFrag MemFrag256,
@@ -237,7 +238,7 @@
 }
 
 let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
-    hasSideEffects = 0 in
+    hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                        string OpStr, string PackTy, string Suff,
                        SDNode OpNode, RegisterClass RC,
@@ -263,7 +264,8 @@
 // the lowest element of the FMA*_Int instruction. Even though such analysis
 // may be not implemented yet we allow the routines doing the actual commute
 // transformation to decide if one or another instruction is commutable or not.
-let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0,
+    Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
                         Operand memopr, RegisterClass RC,
                         X86FoldableSchedWrite sched> {
@@ -384,6 +386,7 @@
 // FMA4 - AMD 4 operand Fused Multiply-Add instructions
 //===----------------------------------------------------------------------===//
 
+let Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                  X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
                  PatFrag mem_frag, X86FoldableSchedWrite sched> {
@@ -425,7 +428,8 @@
 
 multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
                      ValueType VT, X86FoldableSchedWrite sched> {
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let isCodeGenOnly = 1, hasSideEffects = 0,
+    Uses = [MXCSR], mayRaiseFPException = 1 in {
   def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, VR128:$src3),
                !strconcat(OpcodeStr,
@@ -446,6 +450,7 @@
 } // isCodeGenOnly = 1
 }
 
+let Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                  ValueType OpVT128, ValueType OpVT256,
                  PatFrag ld_frag128, PatFrag ld_frag256,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5542,7 +5542,7 @@
 
 // FP round - roundss, roundps, roundsd, roundpd
 let Predicates = [HasAVX, NoVLX] in {
-  let ExeDomain = SSEPackedSingle in {
+  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
     // Intrinsic form
     defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                      loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
@@ -5552,7 +5552,7 @@
                                      VEX, VEX_L, VEX_WIG;
   }
 
-  let ExeDomain = SSEPackedDouble in {
+  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
     defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                      loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
                                      VEX, VEX_WIG;
@@ -5564,9 +5564,9 @@
 let Predicates = [UseAVX] in {
   defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                   v4f32, v2f64, X86RndScales, 0>,
-                                  VEX_4V, VEX_LIG, VEX_WIG;
+                                  VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
   defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
-                                VEX_4V, VEX_LIG, VEX_WIG;
+                                VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
 }
 
 let Predicates = [UseAVX] in {
@@ -7326,12 +7326,12 @@
 }
 
 let Predicates = [HasF16C, NoVLX] in {
-  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
-  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
+  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
+  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
   defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
-                               WriteCvtPS2PHSt>;
+                               WriteCvtPS2PHSt>, SIMD_EXC;
   defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
-                               WriteCvtPS2PHYSt>, VEX_L;
+                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
 
   // Pattern match vcvtph2ps of a scalar i64 load.
   def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
diff --git a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
--- a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
+++ b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mattr=+mmx -stop-after finalize-isel -o - %s | FileCheck %s
+; RUN: llc -march=x86-64 -mattr=+mmx,+fma,+f16c -stop-after finalize-isel -o - %s | FileCheck %s
 ; This test ensures that the MXCSR is implicitly used by MMX FP instructions.
 
 define x86_mmx @mxcsr_mmx(<4 x float> %a0) {
@@ -15,8 +15,31 @@
   ret x86_mmx %5
 }
 
+define half @mxcsr_f16c(float %a) {
+; CHECK: VCVTPS2PH{{.*}}mxcsr
+; CHECK: VCVTPH2PS{{.*}}mxcsr
+  %res = fptrunc float %a to half
+  ret half %res
+}
+
+define <4 x float> @mxcsr_fma_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK: VFMADD{{.*}}mxcsr
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float>
+%a)
+  ret <4 x float> %res
+}
+
+define <4 x float> @mxcsr_fma_ps(<4 x float> %a, <4 x float> %b) {
+; CHECK: VFMADD{{.*}}mxcsr
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float>
+%a)
+  ret <4 x float> %res
+}
+
 declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>)
 declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx)
 declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>)
 declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
 declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>)
+declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
diff --git a/llvm/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s b/llvm/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s
--- a/llvm/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s
+++ b/llvm/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s
@@ -6,5 +6,4 @@
 CHECK-NEXT: instructions:
 CHECK-NEXT: VFMADDSS4rm
 CHECK: register_initial_values:
-# FIXME: This will be changed to CHECK by the following patch that modeling MXCSR to VFMADDSS.
-CHECK-NOT: MXCSR
+CHECK: MXCSR
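Note: the updated RUN line exercises the FMA and F16C changes, but the VROUND forms that gain SIMD_EXC above have no companion check in mxcsr-reg-usage.ll. A minimal sketch of one, assuming +avx is also appended to the RUN line's -mattr list and that the SSE4.1 round intrinsic selects a VROUNDPS opcode in that configuration (the function name below is illustrative, not part of the patch):

define <4 x float> @mxcsr_vround(<4 x float> %a) {
; CHECK: VROUNDPS{{.*}}mxcsr
  ; imm 4 = round using the current MXCSR rounding mode
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a, i32 4)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32)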