Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -5790,13 +5790,13 @@ (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA]>; defm m: AVX512_maskable_3src, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA, ReadAfterLd]>; defm mb: AVX512_maskable_3src, - AVX512FMA3Base, EVEX_B; + AVX512FMA3Base, EVEX_B, Sched<[WriteFMA, ReadAfterLd]>; } } @@ -5815,7 +5815,7 @@ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>; } multiclass avx512_fma3p_213_common opc, string OpcodeStr, SDNode OpNode, @@ -5857,13 +5857,13 @@ (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1, vselect, 1>, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA]>; defm m: AVX512_maskable_3src, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA, ReadAfterLd]>; defm mb: AVX512_maskable_3src, AVX512FMA3Base, EVEX_B; + _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B, + Sched<[WriteFMA, ReadAfterLd]>; } } @@ -5883,7 +5884,7 @@ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1, vselect, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>; } multiclass avx512_fma3p_231_common opc, string OpcodeStr, SDNode OpNode, @@ -5924,7 +5925,7 @@ (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1, vselect, 1>, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. @@ -5932,7 +5933,7 @@ (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA, ReadAfterLd]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. @@ -5941,7 +5942,8 @@ OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B; + _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B, + Sched<[WriteFMA, ReadAfterLd]>; } } @@ -5953,7 +5955,7 @@ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), 1, 1, vselect, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>; } multiclass avx512_fma3p_132_common opc, string OpcodeStr, SDNode OpNode, @@ -5994,28 +5996,30 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in { defm r_Int: AVX512_maskable_3src_scalar, AVX512FMA3Base; + "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base, + Sched<[WriteFMA]>; defm m_Int: AVX512_maskable_3src_scalar, AVX512FMA3Base; + "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base, + Sched<[WriteFMA, ReadAfterLd]>; defm rb_Int: AVX512_maskable_3src_scalar, - AVX512FMA3Base, EVEX_B, EVEX_RC; + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA, ReadAfterLd]>; let isCodeGenOnly = 1, isCommutable = 1 in { def r : AVX512FMA3S; + !if(MaskOnlyReg, [], [RHS_r])>, Sched<[WriteFMA]>; def m : AVX512FMA3S; + [RHS_m]>, Sched<[WriteFMA, ReadAfterLd]>; }// isCodeGenOnly = 1 }// Constraints = "$src1 = $dst" } Index: lib/Target/X86/X86InstrFMA.td =================================================================== --- lib/Target/X86/X86InstrFMA.td +++ lib/Target/X86/X86InstrFMA.td @@ -41,7 +41,8 @@ (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>; + [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>, + Sched<[WriteFMA]>; let mayLoad = 1 in def m : FMA3; + (MemFrag addr:$src3))))]>, + Sched<[WriteFMA, ReadAfterLd]>; } multiclass fma3p_rm_231 opc, string OpcodeStr, RegisterClass RC, @@ -60,7 +62,7 @@ (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA]>; let mayLoad = 1 in def m : FMA3; + RC:$src1)))]>, Sched<[WriteFMA, ReadAfterLd]>; } multiclass fma3p_rm_132 opc, string OpcodeStr, RegisterClass RC, @@ -79,7 +81,7 @@ (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. @@ -89,7 +91,7 @@ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (VT (Op (MemFrag addr:$src3), RC:$src1, - RC:$src2)))]>; + RC:$src2)))]>, Sched<[WriteFMA, ReadAfterLd]>; } let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in @@ -172,7 +174,8 @@ (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>; + [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>, + Sched<[WriteFMA]>; let mayLoad = 1 in def m : FMA3S; + (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>, + Sched<[WriteFMA, ReadAfterLd]>; } multiclass fma3s_rm_231 opc, string OpcodeStr, @@ -191,7 +195,7 @@ (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA]>; let mayLoad = 1 in def m : FMA3S; + (OpNode RC:$src2, (load addr:$src3), RC:$src1))]>, + Sched<[WriteFMA, ReadAfterLd]>; } multiclass fma3s_rm_132 opc, string OpcodeStr, @@ -210,7 +215,7 @@ (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. @@ -220,7 +225,8 @@ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, - (OpNode (load addr:$src3), RC:$src1, RC:$src2))]>; + (OpNode (load addr:$src3), RC:$src1, RC:$src2))]>, + Sched<[WriteFMA, ReadAfterLd]>; } let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in @@ -257,14 +263,14 @@ (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA]>; let mayLoad = 1 in def m_Int : FMA3S; + []>, Sched<[WriteFMA, ReadAfterLd]>; } // The FMA 213 form is created for lowering of scalar FMA intrinscis @@ -360,26 +366,29 @@ !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG; + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, + Sched<[WriteFMA]>; def rm : FMA4S, VEX_W, VEX_LIG; + (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, + Sched<[WriteFMA, ReadAfterLd]>; def mr : FMA4S, VEX_LIG; + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG, + Sched<[WriteFMA, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : FMA4S, - VEX_LIG, FoldGenData; + VEX_LIG, FoldGenData, Sched<[WriteFMA]>; } multiclass fma4s_int opc, string OpcodeStr, Operand memop, @@ -391,26 +400,27 @@ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W, - VEX_LIG; + VEX_LIG, Sched<[WriteFMA]>; def rm_Int : FMA4S, VEX_W, VEX_LIG; + mem_cpat:$src3)))]>, VEX_W, VEX_LIG, + Sched<[WriteFMA, ReadAfterLd]>; def mr_Int : FMA4S, - VEX_LIG; + VEX_LIG, Sched<[WriteFMA, ReadAfterLd]>; let hasSideEffects = 0 in def rr_Int_REV : FMA4S, VEX_LIG, FoldGenData; + []>, VEX_LIG, FoldGenData, Sched<[WriteFMA]>; } // isCodeGenOnly = 1 } @@ -424,19 +434,21 @@ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, - VEX_W; + VEX_W, Sched<[WriteFMA]>; def rm : FMA4, VEX_W; + (ld_frag128 addr:$src3)))]>, VEX_W, + Sched<[WriteFMA, ReadAfterLd]>; def mr : FMA4; + (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>, + Sched<[WriteFMA, ReadAfterLd]>; let isCommutable = 1 in def Yrr : FMA4, - VEX_W, VEX_L; + VEX_W, VEX_L, Sched<[WriteFMA]>; def Yrm : FMA4, VEX_W, VEX_L; + (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L, + Sched<[WriteFMA, ReadAfterLd]>; def Ymr : FMA4, VEX_L; + (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L, + Sched<[WriteFMA, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def rr_REV : FMA4, - FoldGenData; + Sched<[WriteFMA]>, FoldGenData; def Yrr_REV : FMA4, - VEX_L, FoldGenData; + VEX_L, Sched<[WriteFMA]>, FoldGenData; } // isCodeGenOnly = 1 } Index: lib/Target/X86/X86SchedBroadwell.td =================================================================== --- lib/Target/X86/X86SchedBroadwell.td +++ lib/Target/X86/X86SchedBroadwell.td @@ -131,7 +131,7 @@ defm : BWWriteResPair; // Floating point square root. defm : BWWriteResPair; // Floating point reciprocal estimate. defm : BWWriteResPair; // Floating point reciprocal square root estimate. -// defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. +defm : BWWriteResPair; // Fused Multiply Add. defm : BWWriteResPair; // Floating point vector shuffles. defm : BWWriteResPair; // Floating point vector blends. def : WriteRes { // Fp vector variable blends. Index: lib/Target/X86/X86SchedHaswell.td =================================================================== --- lib/Target/X86/X86SchedHaswell.td +++ lib/Target/X86/X86SchedHaswell.td @@ -134,6 +134,7 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; Index: lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- lib/Target/X86/X86SchedSandyBridge.td +++ lib/Target/X86/X86SchedSandyBridge.td @@ -276,11 +276,12 @@ def : WriteRes; def : WriteRes; -// AVX2 is not supported on that architecture, but we should define the basic +// AVX2/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; +defm : SBWriteResPair; // Remaining SNB instrs. Index: lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- lib/Target/X86/X86SchedSkylakeClient.td +++ lib/Target/X86/X86SchedSkylakeClient.td @@ -132,7 +132,7 @@ defm : SKLWriteResPair; // Floating point square root. defm : SKLWriteResPair; // Floating point reciprocal estimate. defm : SKLWriteResPair; // Floating point reciprocal square root estimate. -// defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. +defm : SKLWriteResPair; // Fused Multiply Add. defm : SKLWriteResPair; // Floating point vector shuffles. defm : SKLWriteResPair; // Floating point vector blends. def : WriteRes { // Fp vector variable blends. Index: lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- lib/Target/X86/X86SchedSkylakeServer.td +++ lib/Target/X86/X86SchedSkylakeServer.td @@ -132,7 +132,7 @@ defm : SKXWriteResPair; // Floating point square root. defm : SKXWriteResPair; // Floating point reciprocal estimate. defm : SKXWriteResPair; // Floating point reciprocal square root estimate. -// defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. +defm : SKXWriteResPair; // Fused Multiply Add. defm : SKXWriteResPair; // Floating point vector shuffles. defm : SKXWriteResPair; // Floating point vector blends. def : WriteRes { // Fp vector variable blends. Index: lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- lib/Target/X86/X86ScheduleBtVer2.td +++ lib/Target/X86/X86ScheduleBtVer2.td @@ -168,6 +168,7 @@ defm : JWriteResFpuPair; defm : JWriteResFpuPair; +defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; Index: lib/Target/X86/X86ScheduleSLM.td =================================================================== --- lib/Target/X86/X86ScheduleSLM.td +++ lib/Target/X86/X86ScheduleSLM.td @@ -249,7 +249,7 @@ def : WriteRes; def : WriteRes; -// AVX is not supported on that architecture, but we should define the basic +// AVX/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. def : WriteRes; defm : SMWriteResPair; @@ -257,4 +257,5 @@ defm : SMWriteResPair; defm : SMWriteResPair; defm : SMWriteResPair; +defm : SMWriteResPair; } // SchedModel Index: lib/Target/X86/X86ScheduleZnver1.td =================================================================== --- lib/Target/X86/X86ScheduleZnver1.td +++ lib/Target/X86/X86ScheduleZnver1.td @@ -175,6 +175,7 @@ defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair;