Index: llvm/trunk/lib/Target/X86/X86SchedBroadwell.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedBroadwell.td +++ llvm/trunk/lib/Target/X86/X86SchedBroadwell.td @@ -66,6 +66,9 @@ let BufferSize=60; } +// Integer division issued on port 0. +def BWDivider : ProcResource<1>; // Integer division issued on port 0. + // Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 // cycles after the memory operand. def : ReadAdvance; @@ -76,15 +79,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass BWWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 5); + def : WriteRes { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -93,23 +102,15 @@ def : WriteRes; // Arithmetic. -defm : BWWriteResPair; // Simple integer ALU op. -defm : BWWriteResPair; // Integer multiplication. +defm : BWWriteResPair; // Simple integer ALU op. +defm : BWWriteResPair; // Integer multiplication. +defm : BWWriteResPair; def : WriteRes { let Latency = 3; } // Integer multiplication, high part. -def BWDivider : ProcResource<1>; // Integer division issued on port 0. -def : WriteRes { // Integer division. - let Latency = 25; - let ResourceCycles = [1, 10]; -} -def : WriteRes { - let Latency = 29; - let ResourceCycles = [1, 1, 10]; -} def : WriteRes; // LEA instructions can't fold loads. // Integer shifts and rotates. -defm : BWWriteResPair; +defm : BWWriteResPair; // Loads, stores, and moves, not folded with other operations. def : WriteRes { let Latency = 5; } @@ -125,30 +126,23 @@ // Branches don't produce values, so they have no latency, but they still // consume resources. Indirect branches can fold loads. -defm : BWWriteResPair; +defm : BWWriteResPair; // Floating point. This covers both scalar and vector operations. def : WriteRes { let Latency = 5; } def : WriteRes; def : WriteRes; -defm : BWWriteResPair; // Floating point add/sub/compare. -defm : BWWriteResPair; // Floating point multiplication. -defm : BWWriteResPair; // 10-14 cycles. // Floating point division. -defm : BWWriteResPair; // Floating point square root. -defm : BWWriteResPair; // Floating point reciprocal estimate. -defm : BWWriteResPair; // Floating point reciprocal square root estimate. -defm : BWWriteResPair; // Fused Multiply Add. -defm : BWWriteResPair; // Floating point vector shuffles. -defm : BWWriteResPair; // Floating point vector blends. -def : WriteRes { // Fp vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : BWWriteResPair; // Floating point add/sub/compare. +defm : BWWriteResPair; // Floating point multiplication. +defm : BWWriteResPair; // 10-14 cycles. // Floating point division. +defm : BWWriteResPair; // Floating point square root. +defm : BWWriteResPair; // Floating point reciprocal estimate. +defm : BWWriteResPair; // Floating point reciprocal square root estimate. +defm : BWWriteResPair; // Fused Multiply Add. +defm : BWWriteResPair; // Floating point vector shuffles. +defm : BWWriteResPair; // Floating point vector blends. +defm : BWWriteResPair; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -158,38 +152,22 @@ def : WriteRes; def : WriteRes; -defm : BWWriteResPair; // Vector integer ALU op, no logicals. -defm : BWWriteResPair; // Vector integer shifts. -defm : BWWriteResPair; // Vector integer multiply. -defm : BWWriteResPair; // Vector shuffles. -defm : BWWriteResPair; // Vector blends. - -def : WriteRes { // Vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes { // Vector MPSAD. - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : BWWriteResPair; // Vector integer ALU op, no logicals. +defm : BWWriteResPair; // Vector integer shifts. +defm : BWWriteResPair; // Vector integer multiply. +defm : BWWriteResPair; // Vector shuffles. +defm : BWWriteResPair; // Vector blends. +defm : BWWriteResPair; // Vector variable blends. +defm : BWWriteResPair; // Vector MPSAD. // Vector bitwise operations. // These are often used on both floating point and integer vectors. -defm : BWWriteResPair; // Vector and/or/xor. +defm : BWWriteResPair; // Vector and/or/xor. // Conversion between integer and float. -defm : BWWriteResPair; // Float -> Integer. -defm : BWWriteResPair; // Integer -> Float. -defm : BWWriteResPair; // Float -> Float size conversion. +defm : BWWriteResPair; // Float -> Integer. +defm : BWWriteResPair; // Integer -> Float. +defm : BWWriteResPair; // Float -> Float size conversion. // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -257,29 +235,15 @@ } // Carry-less multiplication instructions. -def : WriteRes { - let Latency = 7; - let ResourceCycles = [2, 1]; -} -def : WriteRes { - let Latency = 7; - let ResourceCycles = [2, 1, 1]; -} +defm : BWWriteResPair; // Catch-all for expensive system instructions. def : WriteRes { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : BWWriteResPair; // Fp 256-bit width vector shuffles. -defm : BWWriteResPair; // 256-bit width vector shuffles. -def : WriteRes { // Variable vector shifts. - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} +defm : BWWriteResPair; // Fp 256-bit width vector shuffles. +defm : BWWriteResPair; // 256-bit width vector shuffles. +defm : BWWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -293,27 +257,9 @@ //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes { - let Latency = 3; -} -// x,m / v,v,m. -def : WriteRes { - let Latency = 7; - let ResourceCycles = [1, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes; - -// v <- v,m. -def : WriteRes { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : BWWriteResPair; +defm : BWWriteResPair; // Remaining instrs. Index: llvm/trunk/lib/Target/X86/X86SchedHaswell.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedHaswell.td +++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td @@ -80,15 +80,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass HWWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 5); + def : WriteRes { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -103,11 +109,11 @@ def : WriteRes; def : WriteRes; -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; def : WriteRes { let Latency = 3; } -defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; // This is for simple LEAs with one or two input operands. // The complex ones can only execute on port 1, and they require two cycles on @@ -129,68 +135,36 @@ def : WriteRes { let Latency = 5; } def : WriteRes; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; // 10-14 cycles. -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; - -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; // 10-14 cycles. +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; // Vector integer operations. def : WriteRes; def : WriteRes { let Latency = 5; } def : WriteRes; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; - -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} - -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; // String instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -544,34 +518,8 @@ // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes { - let Latency = 5; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; -} - -// x,m / v,v,m. -def : WriteRes { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1, 2, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes { - let Latency = 3; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; -} -// v <- v,m. -def : WriteRes { - let Latency = 6; - let NumMicroOps = 3; - let ResourceCycles = [1, 2, 1]; -} +defm : HWWriteResPair; +defm : HWWriteResPair; //=== Floating Point XMM and YMM Instructions ===// Index: llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td +++ llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td @@ -71,15 +71,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SBWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 4); + def : WriteRes { + let Latency = !add(Lat, 4); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -92,106 +98,57 @@ def : WriteRes; def : WriteRes; -defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; def : WriteRes { let Latency = 3; } -defm : SBWriteResPair; -defm : SBWriteResPair; + +defm : SBWriteResPair; +defm : SBWriteResPair; // This is for simple LEAs with one or two input operands. // The complex ones can only execute on port 1, and they require two cycles on // the port to read all inputs. We don't model that. def : WriteRes; -// This is quite rough, latency depends on the dividend. -def : WriteRes { - let Latency = 25; - let ResourceCycles = [1, 10]; -} -def : WriteRes { - let Latency = 29; - let ResourceCycles = [1, 1, 10]; -} - // Scalar and vector floating point. def : WriteRes; def : WriteRes { let Latency = 6; } def : WriteRes; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -def : WriteRes { - let Latency = 2; - let ResourceCycles = [1, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 1]; -} +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; // Vector integer operations. def : WriteRes; def : WriteRes { let Latency = 6; } def : WriteRes; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -def : WriteRes { - let Latency = 2; - let ResourceCycles = [1, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 1]; -} -def : WriteRes { - let Latency = 5; - let NumMicroOps = 3; - let ResourceCycles = [1,2]; -} -def : WriteRes { - let Latency = 11; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes { - let Latency = 3; -} - -// x,m / v,v,m. -def : WriteRes { - let Latency = 7; - let ResourceCycles = [1, 1]; -} -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes; - -// v <- v,m. -def : WriteRes { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : SBWriteResPair; +defm : SBWriteResPair; // String instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -286,10 +243,10 @@ // AVX2/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; +defm : SBWriteResPair; // Remaining SNB instrs. Index: llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td +++ llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td @@ -77,15 +77,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SKLWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 5); + def : WriteRes { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -94,8 +100,8 @@ def : WriteRes; // Arithmetic. -defm : SKLWriteResPair; // Simple integer ALU op. -defm : SKLWriteResPair; // Integer multiplication. +defm : SKLWriteResPair; // Simple integer ALU op. +defm : SKLWriteResPair; // Integer multiplication. def : WriteRes { let Latency = 3; } // Integer multiplication, high part. def SKLDivider : ProcResource<1>; // Integer division issued on port 0. def : WriteRes { // Integer division. @@ -110,7 +116,7 @@ def : WriteRes; // LEA instructions can't fold loads. // Integer shifts and rotates. -defm : SKLWriteResPair; +defm : SKLWriteResPair; // Loads, stores, and moves, not folded with other operations. def : WriteRes { let Latency = 5; } @@ -123,30 +129,23 @@ // Branches don't produce values, so they have no latency, but they still // consume resources. Indirect branches can fold loads. -defm : SKLWriteResPair; +defm : SKLWriteResPair; // Floating point. This covers both scalar and vector operations. def : WriteRes { let Latency = 6; } def : WriteRes; def : WriteRes; -defm : SKLWriteResPair; // Floating point add/sub/compare. -defm : SKLWriteResPair; // Floating point multiplication. -defm : SKLWriteResPair; // 10-14 cycles. // Floating point division. -defm : SKLWriteResPair; // Floating point square root. -defm : SKLWriteResPair; // Floating point reciprocal estimate. -defm : SKLWriteResPair; // Floating point reciprocal square root estimate. -defm : SKLWriteResPair; // Fused Multiply Add. -defm : SKLWriteResPair; // Floating point vector shuffles. -defm : SKLWriteResPair; // Floating point vector blends. -def : WriteRes { // Fp vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : SKLWriteResPair; // Floating point add/sub/compare. +defm : SKLWriteResPair; // Floating point multiplication. +defm : SKLWriteResPair; // 10-14 cycles. // Floating point division. +defm : SKLWriteResPair; // Floating point square root. +defm : SKLWriteResPair; // Floating point reciprocal estimate. +defm : SKLWriteResPair; // Floating point reciprocal square root estimate. +defm : SKLWriteResPair; // Fused Multiply Add. +defm : SKLWriteResPair; // Floating point vector shuffles. +defm : SKLWriteResPair; // Floating point vector blends. +defm : SKLWriteResPair; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -156,38 +155,22 @@ def : WriteRes; def : WriteRes; -defm : SKLWriteResPair; // Vector integer ALU op, no logicals. -defm : SKLWriteResPair; // Vector integer shifts. -defm : SKLWriteResPair; // Vector integer multiply. -defm : SKLWriteResPair; // Vector shuffles. -defm : SKLWriteResPair; // Vector blends. - -def : WriteRes { // Vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes { // Vector MPSAD. - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : SKLWriteResPair; // Vector integer ALU op, no logicals. +defm : SKLWriteResPair; // Vector integer shifts. +defm : SKLWriteResPair; // Vector integer multiply. +defm : SKLWriteResPair; // Vector shuffles. +defm : SKLWriteResPair; // Vector blends. +defm : SKLWriteResPair; // Vector variable blends. +defm : SKLWriteResPair; // Vector MPSAD. // Vector bitwise operations. // These are often used on both floating point and integer vectors. -defm : SKLWriteResPair; // Vector and/or/xor. +defm : SKLWriteResPair; // Vector and/or/xor. // Conversion between integer and float. -defm : SKLWriteResPair; // Float -> Integer. -defm : SKLWriteResPair; // Integer -> Float. -defm : SKLWriteResPair; // Float -> Float size conversion. +defm : SKLWriteResPair; // Float -> Integer. +defm : SKLWriteResPair; // Integer -> Float. +defm : SKLWriteResPair; // Float -> Float size conversion. // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -268,16 +251,9 @@ def : WriteRes { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : SKLWriteResPair; // Fp 256-bit width vector shuffles. -defm : SKLWriteResPair; // 256-bit width vector shuffles. -def : WriteRes { // Variable vector shifts. - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} +defm : SKLWriteResPair; // Fp 256-bit width vector shuffles. +defm : SKLWriteResPair; // 256-bit width vector shuffles. +defm : SKLWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -291,27 +267,9 @@ //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes { - let Latency = 3; -} -// x,m / v,v,m. -def : WriteRes { - let Latency = 7; - let ResourceCycles = [1, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes; - -// v <- v,m. -def : WriteRes { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : SKLWriteResPair; +defm : SKLWriteResPair; // Remaining instrs. Index: llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td +++ llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td @@ -77,15 +77,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SKXWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 5); + def : WriteRes { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -94,8 +100,8 @@ def : WriteRes; // Arithmetic. -defm : SKXWriteResPair; // Simple integer ALU op. -defm : SKXWriteResPair; // Integer multiplication. +defm : SKXWriteResPair; // Simple integer ALU op. +defm : SKXWriteResPair; // Integer multiplication. def : WriteRes { let Latency = 3; } // Integer multiplication, high part. def SKXDivider : ProcResource<1>; // Integer division issued on port 0. def : WriteRes { // Integer division. @@ -110,7 +116,7 @@ def : WriteRes; // LEA instructions can't fold loads. // Integer shifts and rotates. -defm : SKXWriteResPair; +defm : SKXWriteResPair; // Loads, stores, and moves, not folded with other operations. def : WriteRes { let Latency = 5; } @@ -123,30 +129,23 @@ // Branches don't produce values, so they have no latency, but they still // consume resources. Indirect branches can fold loads. -defm : SKXWriteResPair; +defm : SKXWriteResPair; // Floating point. This covers both scalar and vector operations. def : WriteRes { let Latency = 5; } def : WriteRes; def : WriteRes; -defm : SKXWriteResPair; // Floating point add/sub/compare. -defm : SKXWriteResPair; // Floating point multiplication. -defm : SKXWriteResPair; // 10-14 cycles. // Floating point division. -defm : SKXWriteResPair; // Floating point square root. -defm : SKXWriteResPair; // Floating point reciprocal estimate. -defm : SKXWriteResPair; // Floating point reciprocal square root estimate. -defm : SKXWriteResPair; // Fused Multiply Add. -defm : SKXWriteResPair; // Floating point vector shuffles. -defm : SKXWriteResPair; // Floating point vector blends. -def : WriteRes { // Fp vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} +defm : SKXWriteResPair; // Floating point add/sub/compare. +defm : SKXWriteResPair; // Floating point multiplication. +defm : SKXWriteResPair; // 10-14 cycles. // Floating point division. +defm : SKXWriteResPair; // Floating point square root. +defm : SKXWriteResPair; // Floating point reciprocal estimate. +defm : SKXWriteResPair; // Floating point reciprocal square root estimate. +defm : SKXWriteResPair; // Fused Multiply Add. +defm : SKXWriteResPair; // Floating point vector shuffles. +defm : SKXWriteResPair; // Floating point vector blends. +defm : SKXWriteResPair; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -156,38 +155,22 @@ def : WriteRes; def : WriteRes; -defm : SKXWriteResPair; // Vector integer ALU op, no logicals. -defm : SKXWriteResPair; // Vector integer shifts. -defm : SKXWriteResPair; // Vector integer multiply. -defm : SKXWriteResPair; // Vector shuffles. -defm : SKXWriteResPair; // Vector blends. - -def : WriteRes { // Vector variable blends. - let Latency = 2; - let ResourceCycles = [2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -def : WriteRes { // Vector MPSAD. - let Latency = 6; - let ResourceCycles = [1, 2]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [1, 1, 2]; -} +defm : SKXWriteResPair; // Vector integer ALU op, no logicals. +defm : SKXWriteResPair; // Vector integer shifts. +defm : SKXWriteResPair; // Vector integer multiply. +defm : SKXWriteResPair; // Vector shuffles. +defm : SKXWriteResPair; // Vector blends. +defm : SKXWriteResPair; // Vector variable blends. +defm : SKXWriteResPair; // Vector MPSAD. // Vector bitwise operations. // These are often used on both floating point and integer vectors. -defm : SKXWriteResPair; // Vector and/or/xor. +defm : SKXWriteResPair; // Vector and/or/xor. // Conversion between integer and float. -defm : SKXWriteResPair; // Float -> Integer. -defm : SKXWriteResPair; // Integer -> Float. -defm : SKXWriteResPair; // Float -> Float size conversion. +defm : SKXWriteResPair; // Float -> Integer. +defm : SKXWriteResPair; // Integer -> Float. +defm : SKXWriteResPair; // Float -> Float size conversion. // Strings instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -268,16 +251,9 @@ def : WriteRes { let Latency = 100; } // def WriteSystem : SchedWrite; // AVX2. -defm : SKXWriteResPair; // Fp 256-bit width vector shuffles. -defm : SKXWriteResPair; // 256-bit width vector shuffles. -def : WriteRes { // Variable vector shifts. - let Latency = 2; - let ResourceCycles = [2, 1]; -} -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1, 1]; -} +defm : SKXWriteResPair; // Fp 256-bit width vector shuffles. +defm : SKXWriteResPair; // 256-bit width vector shuffles. +defm : SKXWriteResPair; // Variable vector shifts. // Old microcoded instructions that nobody use. def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; @@ -291,27 +267,9 @@ //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes { - let Latency = 3; -} -// x,m / v,v,m. -def : WriteRes { - let Latency = 7; - let ResourceCycles = [1, 1]; -} - -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes; - -// v <- v,m. -def : WriteRes { - let Latency = 5; - let ResourceCycles = [1, 1]; -} +defm : SKXWriteResPair; +defm : SKXWriteResPair; // Remaining instrs. Index: llvm/trunk/lib/Target/X86/X86ScheduleSLM.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleSLM.td +++ llvm/trunk/lib/Target/X86/X86ScheduleSLM.td @@ -56,15 +56,21 @@ // This multiclass defines the resource usage for variants with and without // folded loads. multiclass SMWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant is using a single cycle on ExePort. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the // latency. - def : WriteRes { - let Latency = !add(Lat, 3); + def : WriteRes { + let Latency = !add(Lat, 3); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -80,10 +86,10 @@ // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; // This is for simple LEAs with one or two input operands. // The complex ones can only execute on port 1, and they require two cycles on @@ -105,74 +111,37 @@ def : WriteRes { let Latency = 3; } def : WriteRes; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; - -// This is quite rough, latency depends on precision -def : WriteRes { - let Latency = 5; - let ResourceCycles = [1, 2]; -} -def : WriteRes { - let Latency = 8; - let ResourceCycles = [1, 1, 2]; -} - -def : WriteRes { - let Latency = 34; - let ResourceCycles = [1, 34]; -} -def : WriteRes { - let Latency = 37; - let ResourceCycles = [1, 1, 34]; -} +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; // Vector integer operations. def : WriteRes; def : WriteRes { let Latency = 3; } def : WriteRes; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -// HADD, HSUB PS/PD - -def : WriteRes { - let Latency = 3; - let ResourceCycles = [2]; -} - -def : WriteRes { - let Latency = 6; - let ResourceCycles = [2, 1]; -} - -// PHADD|PHSUB (S) W/D. -def : WriteRes { - let Latency = 1; - let ResourceCycles = [1]; -} - -def : WriteRes { - let Latency = 4; - let ResourceCycles = [1, 1]; -} +defm : SMWriteResPair; +defm : SMWriteResPair; // String instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -262,10 +231,10 @@ // AVX/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. def : WriteRes; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; -defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; } // SchedModel Index: llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td +++ llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td @@ -99,30 +99,41 @@ // b. addpd // This multiclass is for folded loads for integer units. multiclass ZnWriteResPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant takes 1-cycle on Execution Port. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on ZnAGU // adds 4 cycles to the latency. - def : WriteRes { - let NumMicroOps = 2; - let Latency = !add(Lat, 4); + def : WriteRes { + let Latency = !add(Lat, 4); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = !add(UOps, 1); } } // This multiclass is for folded loads for floating point units. multiclass ZnWriteResFpuPair { + list ExePorts, + int Lat, list Res = [1], int UOps = 1> { // Register variant takes 1-cycle on Execution Port. - def : WriteRes { let Latency = Lat; } + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } // Memory variant also uses a cycle on ZnAGU // adds 7 cycles to the latency. - def : WriteRes { - let Latency = !add(Lat, 7); + def : WriteRes { + let Latency = !add(Lat, 7); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; } } @@ -136,9 +147,10 @@ def : WriteRes; def : WriteRes; -defm : ZnWriteResPair; -defm : ZnWriteResPair; -defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; @@ -154,67 +166,60 @@ let ResourceCycles = [1, 4, 41]; } -// IMUL +// IMULH def : WriteRes{ let Latency = 4; } -def : WriteRes { - let Latency = 4; -} - -def : WriteRes { - let Latency = 8; -} // Floating point operations def : WriteRes; def : WriteRes; def : WriteRes { let Latency = 8; } -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // Vector integer operations which uses FPU units def : WriteRes; def : WriteRes; def : WriteRes { let Latency = 8; } -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // Vector Shift Operations -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // AES Instructions. -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; def : WriteRes; def : WriteRes; // Following instructions with latency=100 are microcoded. // We set long latency so as to block the entire pipeline. -defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; //Microcoded Instructions let Latency = 100 in { Index: llvm/trunk/test/CodeGen/X86/avx2-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx2-schedule.ll +++ llvm/trunk/test/CodeGen/X86/avx2-schedule.ll @@ -609,7 +609,7 @@ ; GENERIC-LABEL: test_mpsadbw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [9:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mpsadbw: