diff --git a/llvm/lib/Target/ARM/ARMScheduleM7.td b/llvm/lib/Target/ARM/ARMScheduleM7.td --- a/llvm/lib/Target/ARM/ARMScheduleM7.td +++ b/llvm/lib/Target/ARM/ARMScheduleM7.td @@ -105,39 +105,46 @@ // Floating point conversions. def : WriteRes { let Latency = 3; } def : WriteRes { let Latency = 3; } - +def M7WriteFPMOV64 : SchedWriteRes<[M7UnitVPort]> { + let Latency = 3; let ResourceUses = [2]; +} // The FP pipeline has a latency of 3 cycles. // ALU operations (32/64-bit). These go down the FP pipeline. def : WriteRes { let Latency = 3; } -def : WriteRes { +def : WriteRes { + let ResourceUses = [1,2]; let Latency = 4; let BeginGroup = 1; } // Multiplication def : WriteRes { let Latency = 3; } -def : WriteRes { +def : WriteRes { + let ResourceUses = [1,2]; let Latency = 7; let BeginGroup = 1; } // Multiply-accumulate. FPMAC goes down the FP Pipeline. def : WriteRes { let Latency = 6; } -def : WriteRes { +def : WriteRes { + let ResourceUses = [1,2]; let Latency = 11; let BeginGroup = 1; } // Division. Effective scheduling latency is 3, though real latency is larger def : WriteRes { let Latency = 16; } -def : WriteRes { +def : WriteRes { + let ResourceUses = [1,2]; let Latency = 30; let BeginGroup = 1; } // Square-root. Effective scheduling latency is 3; real latency is larger def : WriteRes { let Latency = 16; } -def : WriteRes { +def : WriteRes { + let ResourceUses = [1,2]; let Latency = 30; let BeginGroup = 1; } @@ -283,12 +290,14 @@ // VFP loads and stores def M7LoadSP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; } -def M7LoadDP : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]> { +def M7LoadDP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { + let ResourceUses = [2,2]; let Latency = 2; let SingleIssue = 1; } def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>; -def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]> { +def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort]> { + let ResourceUses = [1,2]; let SingleIssue = 1; } @@ -404,7 +413,7 @@ let Latency = 3; let NumMicroOps = 0; } -def M7WriteVFPExtraVPort : SchedWriteRes<[M7UnitVPort]> { +def M7WriteVFPExtra : SchedWriteRes<[]> { let Latency = 3; let NumMicroOps = 0; } @@ -417,7 +426,7 @@ // VCMP def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let Latency = 0; } -def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> { +def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let Latency = 0; let BeginGroup = 1; } @@ -440,11 +449,11 @@ // VMOV def : InstRW<[WriteFPMOV], (instregex "VMOV(H|S)$", "FCONST(H|S)")>; -def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only], +def : InstRW<[M7WriteFPMOV64, M7Slot0Only], (instregex "VMOVD$")>; -def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only], +def : InstRW<[M7WriteFPMOV64, M7Slot0Only], (instregex "FCONSTD")>; -def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue], +def : InstRW<[M7WriteFPMOV64, M7WriteVFPExtra, M7SingleIssue], (instregex "VMOV(DRR|RRD|RRS|SRR)")>; // Larger-latency overrides. @@ -460,29 +469,42 @@ // Multiply-accumulate. Chained SP timing is correct; rest need overrides // Double-precision chained MAC stalls the pipeline behind it for 3 cycles, -// making it appear to have 3 cycle latency for scheduling. +// making it appear to have 3 cycle latency for scheduling. MAC->MAC +// bypassing is advanced by 2/3 cycles (fused/chained), but other bypassing is +// not. -def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64, - ReadFPMAC, ReadFPMUL, ReadFPMUL], - (instregex "V(N)?ML(A|S)D$")>; +let NumMicroOps = 0 in { + def M7WriteVFPLatOverrideMAC : SchedWriteRes<[]> { let Latency = 3; } + def M7WriteVFPLatOverride5 : SchedWriteRes<[]> { let Latency = 5; } +} -// Single-precision fused MACs look like latency 5 with advance of 2. +def M7MACBypass3 : SchedReadAdvance<3, [WriteFPMAC32, WriteFPMAC64, + M7WriteVFPLatOverrideMAC, + M7WriteVFPLatOverride5, + ]>; +def M7MACBypass2 : SchedReadAdvance<2, [WriteFPMAC32, WriteFPMAC64, + M7WriteVFPLatOverrideMAC, + M7WriteVFPLatOverride5, + ]>; -def M7WriteVFPLatOverride5 : SchedWriteRes<[]> { - let Latency = 5; - let NumMicroOps = 0; -} -def M7ReadFPMAC2 : SchedReadAdvance<2>; +def : InstRW<[WriteFPMAC32, M7MACBypass3, ReadFPMUL, ReadFPMUL], + (instregex "V(N)?ML(A|S)S$")>; + +def : InstRW<[M7WriteVFPLatOverrideMAC, WriteFPMAC64, + M7MACBypass3, ReadFPMUL, ReadFPMUL], + (instregex "V(N)?ML(A|S)D$")>; + +// Single-precision fused MACs look like latency 5. def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32, - M7ReadFPMAC2, ReadFPMUL, ReadFPMUL], + M7MACBypass2, ReadFPMUL, ReadFPMUL], (instregex "VF(N)?M(A|S)S$")>; // Double-precision fused MAC stalls the pipeline behind it for 2 cycles, making // it appear to have 3 cycle latency for scheduling. -def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64, - ReadFPMAC, ReadFPMUL, ReadFPMUL], +def : InstRW<[M7WriteVFPLatOverrideMAC, WriteFPMAC64, + M7MACBypass2, ReadFPMUL, ReadFPMUL], (instregex "VF(N)?M(A|S)D$")>; } // SchedModel = CortexM7Model diff --git a/llvm/test/tools/llvm-mca/ARM/m7-fp.s b/llvm/test/tools/llvm-mca/ARM/m7-fp.s --- a/llvm/test/tools/llvm-mca/ARM/m7-fp.s +++ b/llvm/test/tools/llvm-mca/ARM/m7-fp.s @@ -266,17 +266,17 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1] [2.0] [2.1] [3] [4] [5] [6] [7] [8] [9.0] [9.1] -# CHECK-NEXT: - - - 1.00 1.00 - - - - 2.00 104.00 81.00 81.00 +# CHECK-NEXT: - - - 1.00 1.00 - - - - 2.00 104.00 59.00 59.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2.0] [2.1] [3] [4] [5] [6] [7] [8] [9.0] [9.1] Instructions: # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vabs.f32 s0, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vabs.f64 d0, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vabs.f64 d0, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vadd.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vadd.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vadd.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcmp.f32 s1, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vcmp.f64 d1, d2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vcvt.f32.f64 s1, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcmp.f64 d1, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvt.f32.f64 s1, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvt.f64.f32 d1, s1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvt.f32.u16 s1, s1, #8 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvt.f32.s16 s1, s1, #8 @@ -304,20 +304,20 @@ # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvt.f64.s32 d1, s2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvta.u32.f32 s1, s2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvta.s32.f32 s1, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vcvta.u32.f64 s1, d2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vcvta.s32.f64 s1, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvta.u32.f64 s1, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvta.s32.f64 s1, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtm.u32.f32 s1, s2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtm.s32.f32 s1, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vcvtm.u32.f64 s1, d2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vcvtm.s32.f64 s1, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtm.u32.f64 s1, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtm.s32.f64 s1, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtn.u32.f32 s1, s2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtn.s32.f32 s1, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vcvtn.u32.f64 s1, d2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vcvtn.s32.f64 s1, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtn.u32.f64 s1, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtn.s32.f64 s1, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtp.u32.f32 s1, s2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtp.s32.f32 s1, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vcvtp.u32.f64 s1, d2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vcvtp.s32.f64 s1, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtp.u32.f64 s1, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtp.s32.f64 s1, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtb.f32.f16 s1, s2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtb.f16.f32 s1, s2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtr.u32.f32 s1, s2 @@ -327,64 +327,64 @@ # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtt.f16.f32 s1, s2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vcvtt.f32.f16 s1, s2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vdiv.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vdiv.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vdiv.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vfma.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vfma.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vfma.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vfms.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vfms.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vfms.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vfnma.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vfnma.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vfnma.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vfnms.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vfnms.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vfnms.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vmaxnm.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vmaxnm.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vmaxnm.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vminnm.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vminnm.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vminnm.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vmla.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vmla.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vmla.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vmls.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vmls.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vmls.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 vmov s0, r1 # CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 vmov r0, s1 -# CHECK-NEXT: - - - - - - - - - - - 1.00 1.00 vmov d0, r1, r2 -# CHECK-NEXT: - - - - - - - - - - - 1.00 1.00 vmov r0, r1, d1 -# CHECK-NEXT: - - - - - - - - - - - 1.00 1.00 vmov s0, s1, r0, r1 -# CHECK-NEXT: - - - - - - - - - - - 1.00 1.00 vmov r0, r1, s0, s1 +# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 vmov d0, r1, r2 +# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 vmov r0, r1, d1 +# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 vmov s0, s1, r0, r1 +# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 vmov r0, r1, s0, s1 # CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 vmov.f32 s0, #1.000000e+00 -# CHECK-NEXT: - - - - - - - - - - - 1.00 1.00 vmov.f64 d0, #1.000000e+00 +# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 vmov.f64 d0, #1.000000e+00 # CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 vmov.f32 s0, s1 -# CHECK-NEXT: - - - - - - - - - - - 1.00 1.00 vmov.f64 d0, d1 +# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 vmov.f64 d0, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vmul.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vmul.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vmul.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vneg.f32 s0, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vneg.f64 d0, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vneg.f64 d0, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vnmla.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vnmla.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vnmla.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vnmls.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vnmls.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vnmls.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vnmul.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vnmul.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vnmul.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrinta.f32 s0, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vrinta.f64 d0, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrinta.f64 d0, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintm.f32 s0, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vrintm.f64 d0, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintm.f64 d0, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintn.f32 s0, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vrintn.f64 d0, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintn.f64 d0, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintp.f32 s0, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vrintp.f64 d0, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintp.f64 d0, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintr.f32 s0, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vrintr.f64 d0, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintr.f64 d0, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintz.f32 s0, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vrintz.f64 d0, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintz.f64 d0, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintx.f32 s0, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vrintx.f64 d0, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vrintx.f64 d0, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vseleq.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vseleq.f64 d0, d2, d1 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vseleq.f64 d0, d2, d1 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vsqrt.f32 s0, s2 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vsqrt.f64 d0, d2 +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vsqrt.f64 d0, d2 # CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vsub.f32 s0, s2, s1 -# CHECK-NEXT: - - - - - - - - - - 1.00 1.00 1.00 vsub.f64 d0, d2, d1 -# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 1.00 1.00 vldr d0, [r0] +# CHECK-NEXT: - - - - - - - - - - 1.00 0.50 0.50 vsub.f64 d0, d2, d1 +# CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 0.50 vldr d0, [r0] # CHECK-NEXT: - - - 0.50 0.50 - - - - - - 0.50 0.50 vldr s0, [r0] -# CHECK-NEXT: - - - - - - - - - 1.00 - 1.00 1.00 vstr d0, [r0] +# CHECK-NEXT: - - - - - - - - - 1.00 - 0.50 0.50 vstr d0, [r0] # CHECK-NEXT: - - - - - - - - - 1.00 - 0.50 0.50 vstr s0, [r0]