Index: llvm/trunk/lib/Target/X86/X86SchedBroadwell.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedBroadwell.td +++ llvm/trunk/lib/Target/X86/X86SchedBroadwell.td @@ -66,6 +66,9 @@ let BufferSize=60; } +// Integer division issued on port 0. +def BWDivider : ProcResource<1>; // Integer division issued on port 0. + // Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 // cycles after the memory operand. def : ReadAdvance<ReadAfterLd, 5>; @@ -76,15 +79,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass BWWriteResPair<X86FoldableSchedWrite SchedRW, - ProcResourceKind ExePort, - int Lat> { + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. - def : WriteRes<SchedRW.Folded, [BWPort23, ExePort]> { - let Latency = !add(Lat, 5); + def : WriteRes<SchedRW.Folded, !listconcat([BWPort23], ExePorts)> { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -93,23 +102,15 @@ def : WriteRes<WriteRMW, [BWPort4]>; // Arithmetic. -defm : BWWriteResPair<WriteALU, BWPort0156, 1>; // Simple integer ALU op. -defm : BWWriteResPair<WriteIMul, BWPort1, 3>; // Integer multiplication. +defm : BWWriteResPair<WriteALU, [BWPort0156], 1>; // Simple integer ALU op. +defm : BWWriteResPair<WriteIMul, [BWPort1], 3>; // Integer multiplication. +defm : BWWriteResPair<WriteIDiv, [BWPort0, BWDivider], 25, [1, 10]>; def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. 
-def BWDivider : ProcResource<1>; // Integer division issued on port 0. -def : WriteRes<WriteIDiv, [BWPort0, BWDivider]> { // Integer division. - let Latency = 25; - let ResourceCycles = [1, 10]; -} -def : WriteRes<WriteIDivLd, [BWPort23, BWPort0, BWDivider]> { - let Latency = 29; - let ResourceCycles = [1, 1, 10]; -} def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads. // Integer shifts and rotates. -defm : BWWriteResPair<WriteShift, BWPort06, 1>; +defm : BWWriteResPair<WriteShift, [BWPort06], 1>; // Loads, stores, and moves, not folded with other operations. def : WriteRes<WriteLoad, [BWPort23]> { let Latency = 5; } @@ -125,30 +126,23 @@ // Branches don't produce values, so they have no latency, but they still // consume resources. Indirect branches can fold loads. -defm : BWWriteResPair<WriteJump, BWPort06, 1>; +defm : BWWriteResPair<WriteJump, [BWPort06], 1>; // Floating point. This covers both scalar and vector operations. def : WriteRes<WriteFLoad, [BWPort23]> { let Latency = 5; } def : WriteRes<WriteFStore, [BWPort237, BWPort4]>; def : WriteRes<WriteFMove, [BWPort5]>; -defm : BWWriteResPair<WriteFAdd, BWPort1, 3>; // Floating point add/sub/compare. -defm : BWWriteResPair<WriteFMul, BWPort0, 5>; // Floating point multiplication. -defm : BWWriteResPair<WriteFDiv, BWPort0, 12>; // 10-14 cycles. // Floating point division. -defm : BWWriteResPair<WriteFSqrt, BWPort0, 15>; // Floating point square root. -defm : BWWriteResPair<WriteFRcp, BWPort0, 5>; // Floating point reciprocal estimate. -defm : BWWriteResPair<WriteFRsqrt, BWPort0, 5>; // Floating point reciprocal square root estimate. -defm : BWWriteResPair<WriteFMA, BWPort01, 5>; // Fused Multiply Add. -defm : BWWriteResPair<WriteFShuffle, BWPort5, 1>; // Floating point vector shuffles. -defm : BWWriteResPair<WriteFBlend, BWPort015, 1>; // Floating point vector blends. -def : WriteRes<WriteFVarBlend, [BWPort5]> { // Fp vector variable blends. 
- let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes<WriteFVarBlendLd, [BWPort5, BWPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : BWWriteResPair<WriteFAdd, [BWPort1], 3>; // Floating point add/sub/compare. +defm : BWWriteResPair<WriteFMul, [BWPort0], 5>; // Floating point multiplication. +defm : BWWriteResPair<WriteFDiv, [BWPort0], 12>; // 10-14 cycles. // Floating point division. +defm : BWWriteResPair<WriteFSqrt, [BWPort0], 15>; // Floating point square root. +defm : BWWriteResPair<WriteFRcp, [BWPort0], 5>; // Floating point reciprocal estimate. +defm : BWWriteResPair<WriteFRsqrt, [BWPort0], 5>; // Floating point reciprocal square root estimate. +defm : BWWriteResPair<WriteFMA, [BWPort01], 5>; // Fused Multiply Add. +defm : BWWriteResPair<WriteFShuffle, [BWPort5], 1>; // Floating point vector shuffles. +defm : BWWriteResPair<WriteFBlend, [BWPort015], 1>; // Floating point vector blends. +defm : BWWriteResPair<WriteFVarBlend, [BWPort5], 2, [2]>; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -158,38 +152,22 @@ def : WriteRes<WriteVecStore, [BWPort237, BWPort4]>; def : WriteRes<WriteVecMove, [BWPort015]>; -defm : BWWriteResPair<WriteVecALU, BWPort15, 1>; // Vector integer ALU op, no logicals. -defm : BWWriteResPair<WriteVecShift, BWPort0, 1>; // Vector integer shifts. -defm : BWWriteResPair<WriteVecIMul, BWPort0, 5>; // Vector integer multiply. -defm : BWWriteResPair<WriteShuffle, BWPort5, 1>; // Vector shuffles. -defm : BWWriteResPair<WriteBlend, BWPort15, 1>; // Vector blends. - -def : WriteRes<WriteVarBlend, [BWPort5]> { // Vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes<WriteVarBlendLd, [BWPort5, BWPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes<WriteMPSAD, [BWPort0, BWPort5]> { // Vector MPSAD.
- let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes<WriteMPSADLd, [BWPort23, BWPort0, BWPort5]> { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : BWWriteResPair<WriteVecALU, [BWPort15], 1>; // Vector integer ALU op, no logicals. +defm : BWWriteResPair<WriteVecShift, [BWPort0], 1>; // Vector integer shifts. +defm : BWWriteResPair<WriteVecIMul, [BWPort0], 5>; // Vector integer multiply. +defm : BWWriteResPair<WriteShuffle, [BWPort5], 1>; // Vector shuffles. +defm : BWWriteResPair<WriteBlend, [BWPort15], 1>; // Vector blends. +defm : BWWriteResPair<WriteVarBlend, [BWPort5], 2, [2]>; // Vector variable blends. +defm : BWWriteResPair<WriteMPSAD, [BWPort0, BWPort5], 6, [1, 2]>; // Vector MPSAD. // Vector bitwise operations. // These are often used on both floating point and integer vectors. -defm : BWWriteResPair<WriteVecLogic, BWPort015, 1>; // Vector and/or/xor. +defm : BWWriteResPair<WriteVecLogic, [BWPort015], 1>; // Vector and/or/xor. // Conversion between integer and float. -defm : BWWriteResPair<WriteCvtF2I, BWPort1, 3>; // Float -> Integer. -defm : BWWriteResPair<WriteCvtI2F, BWPort1, 4>; // Integer -> Float. -defm : BWWriteResPair<WriteCvtF2F, BWPort1, 3>; // Float -> Float size conversion. +defm : BWWriteResPair<WriteCvtF2I, [BWPort1], 3>; // Float -> Integer. +defm : BWWriteResPair<WriteCvtI2F, [BWPort1], 4>; // Integer -> Float. +defm : BWWriteResPair<WriteCvtF2F, [BWPort1], 3>; // Float -> Float size conversion. // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -257,29 +235,15 @@ } // Carry-less multiplication instructions. -def : WriteRes<WriteCLMul, [BWPort0, BWPort5]> { - let Latency = 7; - let ResourceCycles = [2, 1]; -} -def : WriteRes<WriteCLMulLd, [BWPort0, BWPort5, BWPort23]> { - let Latency = 7; - let ResourceCycles = [2, 1, 1]; -} +defm : BWWriteResPair<WriteCLMul, [BWPort0, BWPort5], 7, [2, 1]>; // Catch-all for expensive system instructions. 
def : WriteRes<WriteSystem, [BWPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : BWWriteResPair<WriteFShuffle256, BWPort5, 3>; // Fp 256-bit width vector shuffles. -defm : BWWriteResPair<WriteShuffle256, BWPort5, 3>; // 256-bit width vector shuffles. -def : WriteRes<WriteVarVecShift, [BWPort0, BWPort5]> { // Variable vector shifts. - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes<WriteVarVecShiftLd, [BWPort0, BWPort5, BWPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} +defm : BWWriteResPair<WriteFShuffle256, [BWPort5], 3>; // Fp 256-bit width vector shuffles. +defm : BWWriteResPair<WriteShuffle256, [BWPort5], 3>; // 256-bit width vector shuffles. +defm : BWWriteResPair<WriteVarVecShift, [BWPort0, BWPort5], 2, [2, 1]>; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -293,27 +257,9 @@ //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes<WriteFHAdd, [BWPort1]> { - let Latency = 3; -} -// x,m / v,v,m. -def : WriteRes<WriteFHAddLd, [BWPort1, BWPort23]> { - let Latency = 7; - let ResourceCycles = [1, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes<WritePHAdd, [BWPort15]>; - -// v <- v,m. -def : WriteRes<WritePHAddLd, [BWPort15, BWPort23]> { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : BWWriteResPair<WriteFHAdd, [BWPort1], 3>; +defm : BWWriteResPair<WritePHAdd, [BWPort15], 1>; // Remaining instrs. 
Index: llvm/trunk/lib/Target/X86/X86SchedHaswell.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedHaswell.td +++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td @@ -80,15 +80,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW, - ProcResourceKind ExePort, - int Lat> { + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. - def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> { - let Latency = !add(Lat, 5); + def : WriteRes<SchedRW.Folded, !listconcat([HWPort23], ExePorts)> { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -103,11 +109,11 @@ def : WriteRes<WriteMove, [HWPort0156]>; def : WriteRes<WriteZero, []>; -defm : HWWriteResPair<WriteALU, HWPort0156, 1>; -defm : HWWriteResPair<WriteIMul, HWPort1, 3>; +defm : HWWriteResPair<WriteALU, [HWPort0156], 1>; +defm : HWWriteResPair<WriteIMul, [HWPort1], 3>; def : WriteRes<WriteIMulH, []> { let Latency = 3; } -defm : HWWriteResPair<WriteShift, HWPort06, 1>; -defm : HWWriteResPair<WriteJump, HWPort06, 1>; +defm : HWWriteResPair<WriteShift, [HWPort06], 1>; +defm : HWWriteResPair<WriteJump, [HWPort06], 1>; // This is for simple LEAs with one or two input operands. 
// The complex ones can only execute on port 1, and they require two cycles on @@ -129,68 +135,36 @@ def : WriteRes<WriteFLoad, [HWPort23]> { let Latency = 5; } def : WriteRes<WriteFMove, [HWPort5]>; -defm : HWWriteResPair<WriteFAdd, HWPort1, 3>; -defm : HWWriteResPair<WriteFMul, HWPort0, 5>; -defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles. -defm : HWWriteResPair<WriteFRcp, HWPort0, 5>; -defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>; -defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>; -defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>; -defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>; -defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>; -defm : HWWriteResPair<WriteFMA, HWPort01, 5>; -defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>; -defm : HWWriteResPair<WriteFBlend, HWPort015, 1>; -defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>; - -def : WriteRes<WriteFVarBlend, [HWPort5]> { - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : HWWriteResPair<WriteFAdd, [HWPort1], 3>; +defm : HWWriteResPair<WriteFMul, [HWPort0], 5>; +defm : HWWriteResPair<WriteFDiv, [HWPort0], 12>; // 10-14 cycles. +defm : HWWriteResPair<WriteFRcp, [HWPort0], 5>; +defm : HWWriteResPair<WriteFRsqrt, [HWPort0], 5>; +defm : HWWriteResPair<WriteFSqrt, [HWPort0], 15>; +defm : HWWriteResPair<WriteCvtF2I, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtI2F, [HWPort1], 4>; +defm : HWWriteResPair<WriteCvtF2F, [HWPort1], 3>; +defm : HWWriteResPair<WriteFMA, [HWPort01], 5>; +defm : HWWriteResPair<WriteFShuffle, [HWPort5], 1>; +defm : HWWriteResPair<WriteFBlend, [HWPort015], 1>; +defm : HWWriteResPair<WriteFShuffle256, [HWPort5], 3>; +defm : HWWriteResPair<WriteFVarBlend, [HWPort5], 2, [2]>; // Vector integer operations. 
def : WriteRes<WriteVecStore, [HWPort237, HWPort4]>; def : WriteRes<WriteVecLoad, [HWPort23]> { let Latency = 5; } def : WriteRes<WriteVecMove, [HWPort015]>; -defm : HWWriteResPair<WriteVecShift, HWPort0, 1>; -defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>; -defm : HWWriteResPair<WriteVecALU, HWPort15, 1>; -defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>; -defm : HWWriteResPair<WriteShuffle, HWPort5, 1>; -defm : HWWriteResPair<WriteBlend, HWPort15, 1>; -defm : HWWriteResPair<WriteShuffle256, HWPort5, 3>; - -def : WriteRes<WriteVarBlend, [HWPort5]> { - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> { - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} - -def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> { - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : HWWriteResPair<WriteVecShift, [HWPort0], 1>; +defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1>; +defm : HWWriteResPair<WriteVecALU, [HWPort15], 1>; +defm : HWWriteResPair<WriteVecIMul, [HWPort0], 5>; +defm : HWWriteResPair<WriteShuffle, [HWPort5], 1>; +defm : HWWriteResPair<WriteBlend, [HWPort15], 1>; +defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3>; +defm : HWWriteResPair<WriteVarBlend, [HWPort5], 2, [2]>; +defm : HWWriteResPair<WriteVarVecShift, [HWPort0, HWPort5], 2, [2, 1]>; +defm : HWWriteResPair<WriteMPSAD, [HWPort0, HWPort5], 6, [1, 2]>; // String instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -544,34 +518,8 @@ // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. 
-def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> { - let Latency = 5; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; -} - -// x,m / v,v,m. -def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1, 2, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> { - let Latency = 3; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; -} -// v <- v,m. -def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> { - let Latency = 6; - let NumMicroOps = 3; - let ResourceCycles = [1, 2, 1]; -} +defm : HWWriteResPair<WriteFHAdd, [HWPort1, HWPort5], 5, [1, 2], 3>; +defm : HWWriteResPair<WritePHAdd, [HWPort1, HWPort5], 3, [1, 2], 3>; //=== Floating Point XMM and YMM Instructions ===// Index: llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td +++ llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td @@ -71,15 +71,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW, - ProcResourceKind ExePort, - int Lat> { + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the // latency. 
- def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> { - let Latency = !add(Lat, 4); + def : WriteRes<SchedRW.Folded, !listconcat([SBPort23], ExePorts)> { + let Latency = !add(Lat, 4); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -92,106 +98,57 @@ def : WriteRes<WriteMove, [SBPort015]>; def : WriteRes<WriteZero, []>; -defm : SBWriteResPair<WriteALU, SBPort015, 1>; -defm : SBWriteResPair<WriteIMul, SBPort1, 3>; +defm : SBWriteResPair<WriteALU, [SBPort015], 1>; +defm : SBWriteResPair<WriteIMul, [SBPort1], 3>; +defm : SBWriteResPair<WriteIDiv, [SBPort0, SBDivider], 25, [1, 10]>; def : WriteRes<WriteIMulH, []> { let Latency = 3; } -defm : SBWriteResPair<WriteShift, SBPort05, 1>; -defm : SBWriteResPair<WriteJump, SBPort5, 1>; + +defm : SBWriteResPair<WriteShift, [SBPort05], 1>; +defm : SBWriteResPair<WriteJump, [SBPort5], 1>; // This is for simple LEAs with one or two input operands. // The complex ones can only execute on port 1, and they require two cycles on // the port to read all inputs. We don't model that. def : WriteRes<WriteLEA, [SBPort15]>; -// This is quite rough, latency depends on the dividend. -def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> { - let Latency = 25; - let ResourceCycles = [1, 10]; -} -def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> { - let Latency = 29; - let ResourceCycles = [1, 1, 10]; -} - // Scalar and vector floating point. 
def : WriteRes<WriteFStore, [SBPort23, SBPort4]>; def : WriteRes<WriteFLoad, [SBPort23]> { let Latency = 6; } def : WriteRes<WriteFMove, [SBPort5]>; -defm : SBWriteResPair<WriteFAdd, SBPort1, 3>; -defm : SBWriteResPair<WriteFMul, SBPort0, 5>; -defm : SBWriteResPair<WriteFDiv, SBPort0, 24>; -defm : SBWriteResPair<WriteFRcp, SBPort0, 5>; -defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>; -defm : SBWriteResPair<WriteFSqrt, SBPort0, 14>; -defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>; -defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>; -defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>; -defm : SBWriteResPair<WriteFShuffle, SBPort5, 1>; -defm : SBWriteResPair<WriteFBlend, SBPort05, 1>; -def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> { - let Latency = 2; - let ResourceCycles = [1, 1]; -} -def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> { - let Latency = 6; - let ResourceCycles = [1, 1, 1]; -} +defm : SBWriteResPair<WriteFAdd, [SBPort1], 3>; +defm : SBWriteResPair<WriteFMul, [SBPort0], 5>; +defm : SBWriteResPair<WriteFDiv, [SBPort0], 24>; +defm : SBWriteResPair<WriteFRcp, [SBPort0], 5>; +defm : SBWriteResPair<WriteFRsqrt, [SBPort0], 5>; +defm : SBWriteResPair<WriteFSqrt, [SBPort0], 14>; +defm : SBWriteResPair<WriteCvtF2I, [SBPort1], 3>; +defm : SBWriteResPair<WriteCvtI2F, [SBPort1], 4>; +defm : SBWriteResPair<WriteCvtF2F, [SBPort1], 3>; +defm : SBWriteResPair<WriteFShuffle, [SBPort5], 1>; +defm : SBWriteResPair<WriteFBlend, [SBPort05], 1>; +defm : SBWriteResPair<WriteFVarBlend, [SBPort0, SBPort5], 2, [1, 1]>; // One cycle on each of the two ports. // Vector integer operations.
def : WriteRes<WriteVecStore, [SBPort23, SBPort4]>; def : WriteRes<WriteVecLoad, [SBPort23]> { let Latency = 6; } def : WriteRes<WriteVecMove, [SBPort05]>; -defm : SBWriteResPair<WriteVecShift, SBPort5, 1>; -defm : SBWriteResPair<WriteVecLogic, SBPort5, 1>; -defm : SBWriteResPair<WriteVecALU, SBPort1, 3>; -defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>; -defm : SBWriteResPair<WriteShuffle, SBPort5, 1>; -defm : SBWriteResPair<WriteBlend, SBPort15, 1>; -def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> { - let Latency = 2; - let ResourceCycles = [1, 1]; -} -def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> { - let Latency = 6; - let ResourceCycles = [1, 1, 1]; -} -def : WriteRes<WriteMPSAD, [SBPort0,SBPort15]> { - let Latency = 5; - let NumMicroOps = 3; - let ResourceCycles = [1,2]; -} -def : WriteRes<WriteMPSADLd, [SBPort0,SBPort23,SBPort15]> { - let Latency = 11; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} +defm : SBWriteResPair<WriteVecShift, [SBPort5], 1>; +defm : SBWriteResPair<WriteVecLogic, [SBPort5], 1>; +defm : SBWriteResPair<WriteVecALU, [SBPort1], 3>; +defm : SBWriteResPair<WriteVecIMul, [SBPort0], 5>; +defm : SBWriteResPair<WriteShuffle, [SBPort5], 1>; +defm : SBWriteResPair<WriteBlend, [SBPort15], 1>; +defm : SBWriteResPair<WriteVarBlend, [SBPort1, SBPort5], 2, [1, 1]>; // One cycle on each of the two ports. +defm : SBWriteResPair<WriteMPSAD, [SBPort0, SBPort15], 5, [1,2], 3>; //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes<WriteFHAdd, [SBPort1]> { - let Latency = 3; -} - -// x,m / v,v,m. -def : WriteRes<WriteFHAddLd, [SBPort1, SBPort23]> { - let Latency = 7; - let ResourceCycles = [1, 1]; -} -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes<WritePHAdd, [SBPort15]>; - -// v <- v,m.
-def : WriteRes<WritePHAddLd, [SBPort15, SBPort23]> { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : SBWriteResPair<WriteFHAdd, [SBPort1], 3>; +defm : SBWriteResPair<WritePHAdd, [SBPort15], 1>; // String instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -286,10 +243,10 @@ // AVX2/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. -defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>; -defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>; -defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>; -defm : SBWriteResPair<WriteFMA, SBPort01, 5>; +defm : SBWriteResPair<WriteFShuffle256, [SBPort0], 1>; +defm : SBWriteResPair<WriteShuffle256, [SBPort0], 1>; +defm : SBWriteResPair<WriteVarVecShift, [SBPort0], 1>; +defm : SBWriteResPair<WriteFMA, [SBPort01], 5>; // Remaining SNB instrs. Index: llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td +++ llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td @@ -77,15 +77,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SKLWriteResPair<X86FoldableSchedWrite SchedRW, - ProcResourceKind ExePort, - int Lat> { + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. 
- def : WriteRes<SchedRW.Folded, [SKLPort23, ExePort]> { - let Latency = !add(Lat, 5); + def : WriteRes<SchedRW.Folded, !listconcat([SKLPort23], ExePorts)> { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -94,8 +100,8 @@ def : WriteRes<WriteRMW, [SKLPort4]>; // Arithmetic. -defm : SKLWriteResPair<WriteALU, SKLPort0156, 1>; // Simple integer ALU op. -defm : SKLWriteResPair<WriteIMul, SKLPort1, 3>; // Integer multiplication. +defm : SKLWriteResPair<WriteALU, [SKLPort0156], 1>; // Simple integer ALU op. +defm : SKLWriteResPair<WriteIMul, [SKLPort1], 3>; // Integer multiplication. def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. def SKLDivider : ProcResource<1>; // Integer division issued on port 0. def : WriteRes<WriteIDiv, [SKLPort0, SKLDivider]> { // Integer division. @@ -110,7 +116,7 @@ def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads. // Integer shifts and rotates. -defm : SKLWriteResPair<WriteShift, SKLPort06, 1>; +defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>; // Loads, stores, and moves, not folded with other operations. def : WriteRes<WriteLoad, [SKLPort23]> { let Latency = 5; } @@ -123,30 +129,23 @@ // Branches don't produce values, so they have no latency, but they still // consume resources. Indirect branches can fold loads. -defm : SKLWriteResPair<WriteJump, SKLPort06, 1>; +defm : SKLWriteResPair<WriteJump, [SKLPort06], 1>; // Floating point. This covers both scalar and vector operations. def : WriteRes<WriteFLoad, [SKLPort23]> { let Latency = 6; } def : WriteRes<WriteFStore, [SKLPort237, SKLPort4]>; def : WriteRes<WriteFMove, [SKLPort015]>; -defm : SKLWriteResPair<WriteFAdd, SKLPort1, 3>; // Floating point add/sub/compare. -defm : SKLWriteResPair<WriteFMul, SKLPort0, 5>; // Floating point multiplication. -defm : SKLWriteResPair<WriteFDiv, SKLPort0, 12>; // 10-14 cycles. // Floating point division. 
-defm : SKLWriteResPair<WriteFSqrt, SKLPort0, 15>; // Floating point square root. -defm : SKLWriteResPair<WriteFRcp, SKLPort0, 5>; // Floating point reciprocal estimate. -defm : SKLWriteResPair<WriteFRsqrt, SKLPort0, 5>; // Floating point reciprocal square root estimate. -defm : SKLWriteResPair<WriteFMA, SKLPort01, 4>; // Fused Multiply Add. -defm : SKLWriteResPair<WriteFShuffle, SKLPort5, 1>; // Floating point vector shuffles. -defm : SKLWriteResPair<WriteFBlend, SKLPort015, 1>; // Floating point vector blends. -def : WriteRes<WriteFVarBlend, [SKLPort5]> { // Fp vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes<WriteFVarBlendLd, [SKLPort5, SKLPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : SKLWriteResPair<WriteFAdd, [SKLPort1], 3>; // Floating point add/sub/compare. +defm : SKLWriteResPair<WriteFMul, [SKLPort0], 5>; // Floating point multiplication. +defm : SKLWriteResPair<WriteFDiv, [SKLPort0], 12>; // 10-14 cycles. // Floating point division. +defm : SKLWriteResPair<WriteFSqrt, [SKLPort0], 15>; // Floating point square root. +defm : SKLWriteResPair<WriteFRcp, [SKLPort0], 5>; // Floating point reciprocal estimate. +defm : SKLWriteResPair<WriteFRsqrt, [SKLPort0], 5>; // Floating point reciprocal square root estimate. +defm : SKLWriteResPair<WriteFMA, [SKLPort01], 4>; // Fused Multiply Add. +defm : SKLWriteResPair<WriteFShuffle, [SKLPort5], 1>; // Floating point vector shuffles. +defm : SKLWriteResPair<WriteFBlend, [SKLPort015], 1>; // Floating point vector blends. +defm : SKLWriteResPair<WriteFVarBlend, [SKLPort5], 2, [2]>; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -156,38 +155,22 @@ def : WriteRes<WriteVecStore, [SKLPort237, SKLPort4]>; def : WriteRes<WriteVecMove, [SKLPort015]>; -defm : SKLWriteResPair<WriteVecALU, SKLPort15, 1>; // Vector integer ALU op, no logicals. 
-defm : SKLWriteResPair<WriteVecShift, SKLPort0, 1>; // Vector integer shifts. -defm : SKLWriteResPair<WriteVecIMul, SKLPort0, 5>; // Vector integer multiply. -defm : SKLWriteResPair<WriteShuffle, SKLPort5, 1>; // Vector shuffles. -defm : SKLWriteResPair<WriteBlend, SKLPort15, 1>; // Vector blends. - -def : WriteRes<WriteVarBlend, [SKLPort5]> { // Vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes<WriteVarBlendLd, [SKLPort5, SKLPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes<WriteMPSAD, [SKLPort0, SKLPort5]> { // Vector MPSAD. - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes<WriteMPSADLd, [SKLPort23, SKLPort0, SKLPort5]> { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : SKLWriteResPair<WriteVecALU, [SKLPort15], 1>; // Vector integer ALU op, no logicals. +defm : SKLWriteResPair<WriteVecShift, [SKLPort0], 1>; // Vector integer shifts. +defm : SKLWriteResPair<WriteVecIMul, [SKLPort0], 5>; // Vector integer multiply. +defm : SKLWriteResPair<WriteShuffle, [SKLPort5], 1>; // Vector shuffles. +defm : SKLWriteResPair<WriteBlend, [SKLPort15], 1>; // Vector blends. +defm : SKLWriteResPair<WriteVarBlend, [SKLPort5], 2, [2]>; // Vector variable blends. +defm : SKLWriteResPair<WriteMPSAD, [SKLPort0, SKLPort5], 6, [1, 2]>; // Vector MPSAD. // Vector bitwise operations. // These are often used on both floating point and integer vectors. -defm : SKLWriteResPair<WriteVecLogic, SKLPort015, 1>; // Vector and/or/xor. +defm : SKLWriteResPair<WriteVecLogic, [SKLPort015], 1>; // Vector and/or/xor. // Conversion between integer and float. -defm : SKLWriteResPair<WriteCvtF2I, SKLPort1, 3>; // Float -> Integer. -defm : SKLWriteResPair<WriteCvtI2F, SKLPort1, 4>; // Integer -> Float. -defm : SKLWriteResPair<WriteCvtF2F, SKLPort1, 3>; // Float -> Float size conversion. +defm : SKLWriteResPair<WriteCvtF2I, [SKLPort1], 3>; // Float -> Integer. 
+defm : SKLWriteResPair<WriteCvtI2F, [SKLPort1], 4>; // Integer -> Float. +defm : SKLWriteResPair<WriteCvtF2F, [SKLPort1], 3>; // Float -> Float size conversion. // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -268,16 +251,9 @@ def : WriteRes<WriteSystem, [SKLPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : SKLWriteResPair<WriteFShuffle256, SKLPort5, 3>; // Fp 256-bit width vector shuffles. -defm : SKLWriteResPair<WriteShuffle256, SKLPort5, 3>; // 256-bit width vector shuffles. -def : WriteRes<WriteVarVecShift, [SKLPort0, SKLPort5]> { // Variable vector shifts. - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes<WriteVarVecShiftLd, [SKLPort0, SKLPort5, SKLPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} +defm : SKLWriteResPair<WriteFShuffle256, [SKLPort5], 3>; // Fp 256-bit width vector shuffles. +defm : SKLWriteResPair<WriteShuffle256, [SKLPort5], 3>; // 256-bit width vector shuffles. +defm : SKLWriteResPair<WriteVarVecShift, [SKLPort0, SKLPort5], 2, [2, 1]>; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes<WriteMicrocoded, [SKLPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -291,27 +267,9 @@ //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes<WriteFHAdd, [SKLPort1]> { - let Latency = 3; -} -// x,m / v,v,m. -def : WriteRes<WriteFHAddLd, [SKLPort1, SKLPort23]> { - let Latency = 7; - let ResourceCycles = [1, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes<WritePHAdd, [SKLPort15]>; - -// v <- v,m. 
-def : WriteRes<WritePHAddLd, [SKLPort15, SKLPort23]> { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : SKLWriteResPair<WriteFHAdd, [SKLPort1], 3>; +defm : SKLWriteResPair<WritePHAdd, [SKLPort15], 1>; // Remaining instrs. Index: llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td +++ llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td @@ -77,15 +77,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SKXWriteResPair<X86FoldableSchedWrite SchedRW, - ProcResourceKind ExePort, - int Lat> { + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. - def : WriteRes<SchedRW.Folded, [SKXPort23, ExePort]> { - let Latency = !add(Lat, 5); + def : WriteRes<SchedRW.Folded, !listconcat([SKXPort23], ExePorts)> { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -94,8 +100,8 @@ def : WriteRes<WriteRMW, [SKXPort4]>; // Arithmetic. -defm : SKXWriteResPair<WriteALU, SKXPort0156, 1>; // Simple integer ALU op. -defm : SKXWriteResPair<WriteIMul, SKXPort1, 3>; // Integer multiplication. +defm : SKXWriteResPair<WriteALU, [SKXPort0156], 1>; // Simple integer ALU op. +defm : SKXWriteResPair<WriteIMul, [SKXPort1], 3>; // Integer multiplication. def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. def SKXDivider : ProcResource<1>; // Integer division issued on port 0. def : WriteRes<WriteIDiv, [SKXPort0, SKXDivider]> { // Integer division. 
@@ -110,7 +116,7 @@ def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads. // Integer shifts and rotates. -defm : SKXWriteResPair<WriteShift, SKXPort06, 1>; +defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>; // Loads, stores, and moves, not folded with other operations. def : WriteRes<WriteLoad, [SKXPort23]> { let Latency = 5; } @@ -123,30 +129,23 @@ // Branches don't produce values, so they have no latency, but they still // consume resources. Indirect branches can fold loads. -defm : SKXWriteResPair<WriteJump, SKXPort06, 1>; +defm : SKXWriteResPair<WriteJump, [SKXPort06], 1>; // Floating point. This covers both scalar and vector operations. def : WriteRes<WriteFLoad, [SKXPort23]> { let Latency = 5; } def : WriteRes<WriteFStore, [SKXPort237, SKXPort4]>; def : WriteRes<WriteFMove, [SKXPort015]>; -defm : SKXWriteResPair<WriteFAdd, SKXPort1, 3>; // Floating point add/sub/compare. -defm : SKXWriteResPair<WriteFMul, SKXPort0, 5>; // Floating point multiplication. -defm : SKXWriteResPair<WriteFDiv, SKXPort0, 12>; // 10-14 cycles. // Floating point division. -defm : SKXWriteResPair<WriteFSqrt, SKXPort0, 15>; // Floating point square root. -defm : SKXWriteResPair<WriteFRcp, SKXPort0, 5>; // Floating point reciprocal estimate. -defm : SKXWriteResPair<WriteFRsqrt, SKXPort0, 5>; // Floating point reciprocal square root estimate. -defm : SKXWriteResPair<WriteFMA, SKXPort015, 4>; // Fused Multiply Add. -defm : SKXWriteResPair<WriteFShuffle, SKXPort5, 1>; // Floating point vector shuffles. -defm : SKXWriteResPair<WriteFBlend, SKXPort015, 1>; // Floating point vector blends. -def : WriteRes<WriteFVarBlend, [SKXPort5]> { // Fp vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes<WriteFVarBlendLd, [SKXPort5, SKXPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : SKXWriteResPair<WriteFAdd, [SKXPort1], 3>; // Floating point add/sub/compare. 
+defm : SKXWriteResPair<WriteFMul, [SKXPort0], 5>; // Floating point multiplication. +defm : SKXWriteResPair<WriteFDiv, [SKXPort0], 12>; // 10-14 cycles. // Floating point division. +defm : SKXWriteResPair<WriteFSqrt, [SKXPort0], 15>; // Floating point square root. +defm : SKXWriteResPair<WriteFRcp, [SKXPort0], 5>; // Floating point reciprocal estimate. +defm : SKXWriteResPair<WriteFRsqrt, [SKXPort0], 5>; // Floating point reciprocal square root estimate. +defm : SKXWriteResPair<WriteFMA, [SKXPort015], 4>; // Fused Multiply Add. +defm : SKXWriteResPair<WriteFShuffle, [SKXPort5], 1>; // Floating point vector shuffles. +defm : SKXWriteResPair<WriteFBlend, [SKXPort015], 1>; // Floating point vector blends. +defm : SKXWriteResPair<WriteFVarBlend, [SKXPort5], 2, [2]>; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -156,38 +155,22 @@ def : WriteRes<WriteVecStore, [SKXPort237, SKXPort4]>; def : WriteRes<WriteVecMove, [SKXPort015]>; -defm : SKXWriteResPair<WriteVecALU, SKXPort15, 1>; // Vector integer ALU op, no logicals. -defm : SKXWriteResPair<WriteVecShift, SKXPort0, 1>; // Vector integer shifts. -defm : SKXWriteResPair<WriteVecIMul, SKXPort0, 5>; // Vector integer multiply. -defm : SKXWriteResPair<WriteShuffle, SKXPort5, 1>; // Vector shuffles. -defm : SKXWriteResPair<WriteBlend, SKXPort15, 1>; // Vector blends. - -def : WriteRes<WriteVarBlend, [SKXPort5]> { // Vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes<WriteVarBlendLd, [SKXPort5, SKXPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes<WriteMPSAD, [SKXPort0, SKXPort5]> { // Vector MPSAD. - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes<WriteMPSADLd, [SKXPort23, SKXPort0, SKXPort5]> { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : SKXWriteResPair<WriteVecALU, [SKXPort15], 1>; // Vector integer ALU op, no logicals. 
+defm : SKXWriteResPair<WriteVecShift, [SKXPort0], 1>; // Vector integer shifts. +defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 5>; // Vector integer multiply. +defm : SKXWriteResPair<WriteShuffle, [SKXPort5], 1>; // Vector shuffles. +defm : SKXWriteResPair<WriteBlend, [SKXPort15], 1>; // Vector blends. +defm : SKXWriteResPair<WriteVarBlend, [SKXPort5], 2, [2]>; // Vector variable blends. +defm : SKXWriteResPair<WriteMPSAD, [SKXPort0, SKXPort5], 6, [1, 2]>; // Vector MPSAD. // Vector bitwise operations. // These are often used on both floating point and integer vectors. -defm : SKXWriteResPair<WriteVecLogic, SKXPort015, 1>; // Vector and/or/xor. +defm : SKXWriteResPair<WriteVecLogic, [SKXPort015], 1>; // Vector and/or/xor. // Conversion between integer and float. -defm : SKXWriteResPair<WriteCvtF2I, SKXPort1, 3>; // Float -> Integer. -defm : SKXWriteResPair<WriteCvtI2F, SKXPort1, 4>; // Integer -> Float. -defm : SKXWriteResPair<WriteCvtF2F, SKXPort1, 3>; // Float -> Float size conversion. +defm : SKXWriteResPair<WriteCvtF2I, [SKXPort1], 3>; // Float -> Integer. +defm : SKXWriteResPair<WriteCvtI2F, [SKXPort1], 4>; // Integer -> Float. +defm : SKXWriteResPair<WriteCvtF2F, [SKXPort1], 3>; // Float -> Float size conversion. // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -268,16 +251,9 @@ def : WriteRes<WriteSystem, [SKXPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : SKXWriteResPair<WriteFShuffle256, SKXPort5, 3>; // Fp 256-bit width vector shuffles. -defm : SKXWriteResPair<WriteShuffle256, SKXPort5, 3>; // 256-bit width vector shuffles. -def : WriteRes<WriteVarVecShift, [SKXPort0, SKXPort5]> { // Variable vector shifts. - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes<WriteVarVecShiftLd, [SKXPort0, SKXPort5, SKXPort23]> { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} +defm : SKXWriteResPair<WriteFShuffle256, [SKXPort5], 3>; // Fp 256-bit width vector shuffles. 
+defm : SKXWriteResPair<WriteShuffle256, [SKXPort5], 3>; // 256-bit width vector shuffles. +defm : SKXWriteResPair<WriteVarVecShift, [SKXPort0, SKXPort5], 2, [2, 1]>; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes<WriteMicrocoded, [SKXPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -291,27 +267,9 @@ //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes<WriteFHAdd, [SKXPort1]> { - let Latency = 3; -} -// x,m / v,v,m. -def : WriteRes<WriteFHAddLd, [SKXPort1, SKXPort23]> { - let Latency = 7; - let ResourceCycles = [1, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes<WritePHAdd, [SKXPort15]>; - -// v <- v,m. -def : WriteRes<WritePHAddLd, [SKXPort15, SKXPort23]> { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : SKXWriteResPair<WriteFHAdd, [SKXPort1], 3>; +defm : SKXWriteResPair<WritePHAdd, [SKXPort15], 1>; // Remaining instrs. Index: llvm/trunk/lib/Target/X86/X86ScheduleSLM.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleSLM.td +++ llvm/trunk/lib/Target/X86/X86ScheduleSLM.td @@ -56,15 +56,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW, - ProcResourceKind ExePort, - int Lat> { + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the // latency. 
- def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> { - let Latency = !add(Lat, 3); + def : WriteRes<SchedRW.Folded, !listconcat([MEC_RSV], ExePorts)> { + let Latency = !add(Lat, 3); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -80,10 +86,10 @@ // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; -defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>; -defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>; -defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>; -defm : SMWriteResPair<WriteJump, IEC_RSV1, 1>; +defm : SMWriteResPair<WriteALU, [IEC_RSV01], 1>; +defm : SMWriteResPair<WriteIMul, [IEC_RSV1], 3>; +defm : SMWriteResPair<WriteShift, [IEC_RSV0], 1>; +defm : SMWriteResPair<WriteJump, [IEC_RSV1], 1>; // This is for simple LEAs with one or two input operands. // The complex ones can only execute on port 1, and they require two cycles on @@ -105,74 +111,37 @@ def : WriteRes<WriteFLoad, [MEC_RSV]> { let Latency = 3; } def : WriteRes<WriteFMove, [FPC_RSV01]>; -defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>; -defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>; -defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>; -defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>; -defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>; -defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>; -defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>; -defm : SMWriteResPair<WriteFShuffle, FPC_RSV0, 1>; -defm : SMWriteResPair<WriteFBlend, FPC_RSV0, 1>; - -// This is quite rough, latency depends on precision -def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> { - let Latency = 5; - let ResourceCycles = [1, 2]; -} -def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> { - let Latency = 8; - let ResourceCycles = [1, 1, 2]; -} - -def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> { - let Latency = 34; - let ResourceCycles = [1, 34]; -} -def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> { - let Latency = 37; - let ResourceCycles = [1, 1, 34]; -} +defm : 
SMWriteResPair<WriteFAdd, [FPC_RSV1], 3>; +defm : SMWriteResPair<WriteFMul, [FPC_RSV0, SMFPMultiplier], 5, [1,2]>; +defm : SMWriteResPair<WriteFDiv, [FPC_RSV0, SMFPDivider], 34, [1,34]>; +defm : SMWriteResPair<WriteFRcp, [FPC_RSV0], 5>; +defm : SMWriteResPair<WriteFRsqrt, [FPC_RSV0], 5>; +defm : SMWriteResPair<WriteFSqrt, [FPC_RSV0], 15>; +defm : SMWriteResPair<WriteCvtF2I, [FPC_RSV01], 4>; +defm : SMWriteResPair<WriteCvtI2F, [FPC_RSV01], 4>; +defm : SMWriteResPair<WriteCvtF2F, [FPC_RSV01], 4>; +defm : SMWriteResPair<WriteFShuffle, [FPC_RSV0], 1>; +defm : SMWriteResPair<WriteFBlend, [FPC_RSV0], 1>; // Vector integer operations. def : WriteRes<WriteVecStore, [FPC_RSV01, MEC_RSV]>; def : WriteRes<WriteVecLoad, [MEC_RSV]> { let Latency = 3; } def : WriteRes<WriteVecMove, [FPC_RSV01]>; -defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>; -defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>; -defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>; -defm : SMWriteResPair<WriteVecIMul, FPC_RSV0, 4>; -defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>; -defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>; -defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>; +defm : SMWriteResPair<WriteVecShift, [FPC_RSV0], 1>; +defm : SMWriteResPair<WriteVecLogic, [FPC_RSV01], 1>; +defm : SMWriteResPair<WriteVecALU, [FPC_RSV01], 1>; +defm : SMWriteResPair<WriteVecIMul, [FPC_RSV0], 4>; +defm : SMWriteResPair<WriteShuffle, [FPC_RSV0], 1>; +defm : SMWriteResPair<WriteBlend, [FPC_RSV0], 1>; +defm : SMWriteResPair<WriteMPSAD, [FPC_RSV0], 7>; //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD - -def : WriteRes<WriteFHAdd, [FPC_RSV01]> { - let Latency = 3; - let ResourceCycles = [2]; -} - -def : WriteRes<WriteFHAddLd, [FPC_RSV01, MEC_RSV]> { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -// PHADD|PHSUB (S) W/D. 
-def : WriteRes<WritePHAdd, [FPC_RSV01]> { - let Latency = 1; - let ResourceCycles = [1]; -} - -def : WriteRes<WritePHAddLd, [FPC_RSV01, MEC_RSV]> { - let Latency = 4; - let ResourceCycles = [1, 1]; -} +defm : SMWriteResPair<WriteFHAdd, [FPC_RSV01], 3, [2]>; +defm : SMWriteResPair<WritePHAdd, [FPC_RSV01], 1>; // String instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -262,10 +231,10 @@ // AVX/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. def : WriteRes<WriteIMulH, [FPC_RSV0]>; -defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>; -defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>; -defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>; -defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>; -defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>; -defm : SMWriteResPair<WriteFMA, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteVarBlend, [FPC_RSV0], 1>; +defm : SMWriteResPair<WriteFVarBlend, [FPC_RSV0], 1>; +defm : SMWriteResPair<WriteFShuffle256, [FPC_RSV0], 1>; +defm : SMWriteResPair<WriteShuffle256, [FPC_RSV0], 1>; +defm : SMWriteResPair<WriteVarVecShift, [FPC_RSV0], 1>; +defm : SMWriteResPair<WriteFMA, [FPC_RSV0], 1>; } // SchedModel Index: llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td +++ llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td @@ -99,30 +99,41 @@ // b. addpd // This multiclass is for folded loads for integer units. multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW, - ProcResourceKind ExePort, - int Lat> { + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1> { // Register variant takes 1-cycle on Execution Port. 
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on ZnAGU // adds 4 cycles to the latency. - def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> { - let NumMicroOps = 2; - let Latency = !add(Lat, 4); + def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> { + let Latency = !add(Lat, 4); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = !add(UOps, 1); } } // This multiclass is for folded loads for floating point units. multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW, - ProcResourceKind ExePort, - int Lat> { + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1> { // Register variant takes 1-cycle on Execution Port. - def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on ZnAGU // adds 7 cycles to the latency. - def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> { - let Latency = !add(Lat, 7); + def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> { + let Latency = !add(Lat, 7); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -136,9 +147,10 @@ def : WriteRes<WriteZero, []>; def : WriteRes<WriteLEA, [ZnALU]>; -defm : ZnWriteResPair<WriteALU, ZnALU, 1>; -defm : ZnWriteResPair<WriteShift, ZnALU, 1>; -defm : ZnWriteResPair<WriteJump, ZnALU, 1>; +defm : ZnWriteResPair<WriteALU, [ZnALU], 1>; +defm : ZnWriteResPair<WriteIMul, [ZnALU1, ZnMultiplier], 4>; +defm : ZnWriteResPair<WriteShift, [ZnALU], 1>; +defm : ZnWriteResPair<WriteJump, [ZnALU], 1>; // Treat misc copies as a move. 
def : InstRW<[WriteMove], (instrs COPY)>; @@ -154,67 +166,60 @@ let ResourceCycles = [1, 4, 41]; } -// IMUL +// IMULH def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{ let Latency = 4; } -def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> { - let Latency = 4; -} - -def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> { - let Latency = 8; -} // Floating point operations def : WriteRes<WriteFStore, [ZnAGU]>; def : WriteRes<WriteFMove, [ZnFPU]>; def : WriteRes<WriteFLoad, [ZnAGU]> { let Latency = 8; } -defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>; -defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>; -defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>; -defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>; -defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>; -defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>; -defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>; -defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>; -defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>; -defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>; -defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>; -defm : ZnWriteResFpuPair<WriteFMA, ZnFPU03, 5>; -defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>; -defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>; -defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>; +defm : ZnWriteResFpuPair<WriteFHAdd, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>; +defm : ZnWriteResFpuPair<WriteFVarBlend, [ZnFPU01], 1>; +defm : ZnWriteResFpuPair<WriteVarBlend, [ZnFPU0], 1>; +defm : ZnWriteResFpuPair<WriteCvtI2F, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteCvtF2F, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteCvtF2I, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteFDiv, [ZnFPU3], 15>; +defm : ZnWriteResFpuPair<WriteFShuffle, [ZnFPU12], 1>; +defm : ZnWriteResFpuPair<WriteFMul, [ZnFPU0], 5>; +defm : ZnWriteResFpuPair<WriteFMA, [ZnFPU03], 5>; +defm : ZnWriteResFpuPair<WriteFRcp, [ZnFPU01], 5>; +defm : ZnWriteResFpuPair<WriteFRsqrt, 
[ZnFPU01], 5>; +defm : ZnWriteResFpuPair<WriteFSqrt, [ZnFPU3], 20>; // Vector integer operations which uses FPU units def : WriteRes<WriteVecStore, [ZnAGU]>; def : WriteRes<WriteVecMove, [ZnFPU]>; def : WriteRes<WriteVecLoad, [ZnAGU]> { let Latency = 8; } -defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>; -defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>; -defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>; -defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>; -defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>; -defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>; -defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>; -defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>; +defm : ZnWriteResFpuPair<WriteVecShift, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecLogic, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WritePHAdd, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>; +defm : ZnWriteResFpuPair<WriteShuffle, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>; +defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>; // Vector Shift Operations -defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>; +defm : ZnWriteResFpuPair<WriteVarVecShift, [ZnFPU12], 1>; // AES Instructions. -defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>; -defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>; -defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>; +defm : ZnWriteResFpuPair<WriteAESDecEnc, [ZnFPU01], 4>; +defm : ZnWriteResFpuPair<WriteAESIMC, [ZnFPU01], 4>; +defm : ZnWriteResFpuPair<WriteAESKeyGen, [ZnFPU01], 4>; def : WriteRes<WriteFence, [ZnAGU]>; def : WriteRes<WriteNop, []>; // Following instructions with latency=100 are microcoded. // We set long latency so as to block the entire pipeline. 
-defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>; +defm : ZnWriteResFpuPair<WriteFShuffle256, [ZnFPU], 100>; //Microcoded Instructions let Latency = 100 in { Index: llvm/trunk/test/CodeGen/X86/avx2-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx2-schedule.ll +++ llvm/trunk/test/CodeGen/X86/avx2-schedule.ll @@ -609,7 +609,7 @@ ; GENERIC-LABEL: test_mpsadbw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [9:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mpsadbw: