diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
--- a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
+++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
@@ -38,6 +38,7 @@
 
   /// Instructions that were issued, but not executed yet.
   SmallVector IssuedInst;
+  InstRef LastIssuedInst;
 
   /// Number of instructions issued in the current cycle.
   unsigned NumIssued;
diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
--- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
+++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
@@ -57,6 +57,37 @@
   return false;
 }
 
+/// Return the cycles left for write \p WS: the write latency is used while the
+/// value is UNKNOWN_CYCLES, and negative values are clamped to zero.
+static unsigned getWriteCyclesLeft(const WriteState &WS) {
+  int CyclesLeft = WS.getCyclesLeft();
+  if (CyclesLeft == UNKNOWN_CYCLES)
+    CyclesLeft = WS.getLatency();
+  return static_cast<unsigned>(std::max(CyclesLeft, 0));
+}
+
+/// Check that all writes of the \p First instruction occur before writes of
+/// the \p Second instruction.
+/// \returns 0 when the first write of \p Second does not precede the last
+/// write of \p First (always the case if either has no register definitions);
+/// otherwise the number of cycles between those two writes.
+static unsigned checkWritesOrder(const InstRef &First, const InstRef &Second) {
+  // Cycles left until the last write of First.
+  unsigned FirstWriteEnd = 0;
+  for (const WriteState &WS : First.getInstruction()->getDefs())
+    FirstWriteEnd = std::max(FirstWriteEnd, getWriteCyclesLeft(WS));
+
+  // Cycles left until the first write of Second; ~0U means "no writes".
+  unsigned SecondWriteStart = ~0U;
+  for (const WriteState &WS : Second.getInstruction()->getDefs())
+    SecondWriteStart = std::min(SecondWriteStart, getWriteCyclesLeft(WS));
+
+  if (SecondWriteStart >= FirstWriteEnd)
+    return 0;
+
+  return FirstWriteEnd - SecondWriteStart;
+}
+
 /// Return a number of cycles left until register requirements of the
 /// instructions are met.
static unsigned checkRegisterHazard(const RegisterFile &PRF, @@ -118,6 +149,14 @@ HWPressureEvent(HWPressureEvent::RESOURCES, IR)); } + if (LastIssuedInst && !LastIssuedInst.getInstruction()->getDesc().RetireOOO) { + // Delay the instruction to ensure that writes occur in program + // order + if (unsigned StallWritesOrder = checkWritesOrder(LastIssuedInst, IR)) { + *StallCycles = StallWritesOrder; + } + } + return *StallCycles == 0; } @@ -161,8 +200,6 @@ const InstrDesc &Desc = IS.getDesc(); unsigned RCUTokenID = RetireControlUnit::UnhandledTokenID; - if (!Desc.RetireOOO) - RCUTokenID = RCU.dispatch(IR); IS.dispatch(RCUTokenID); if (Desc.EndGroup) { @@ -211,6 +248,7 @@ notifyInstructionExecute(IR, UsedResources, *this); IssuedInst.push_back(IR); + LastIssuedInst = IR; ++NumIssued; return llvm::ErrorSuccess(); diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td --- a/llvm/lib/Target/AArch64/AArch64SchedA55.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td @@ -339,5 +339,4 @@ def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; -def A55RCU : RetireControlUnit<64, 0>; } diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s @@ -35,7 +35,7 @@ # CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] # CHECK: Dynamic Dispatch Stall Cycles: -# CHECK-NEXT: RAT - Register unavailable: 10 (47.6%) +# CHECK-NEXT: RAT - Register unavailable: 8 (38.1%) # CHECK-NEXT: RCU - Retire tokens unavailable: 0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s --- 
a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s @@ -35,7 +35,7 @@ # CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] # CHECK: Dynamic Dispatch Stall Cycles: -# CHECK-NEXT: RAT - Register unavailable: 10 (47.6%) +# CHECK-NEXT: RAT - Register unavailable: 8 (38.1%) # CHECK-NEXT: RCU - Retire tokens unavailable: 0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 @@ -106,13 +106,13 @@ # CHECK: [0,0] DeeER. . . . ldr w4, [x2], #4 # CHECK-NEXT: [0,1] .DeeER . . . ldr w5, [x3] # CHECK-NEXT: [0,2] . DeeeER. . . madd w0, w5, w4, w0 -# CHECK-NEXT: [0,3] . DeeE-R. . . add x3, x3, x13 +# CHECK-NEXT: [0,3] . DeeER. . . add x3, x3, x13 # CHECK-NEXT: [0,4] . DeeER. . . subs x1, x1, #1 # CHECK-NEXT: [0,5] . . DeeeER . . str w0, [x21, x18, lsl #2] # CHECK-NEXT: [1,0] . . DeeER . . ldr w4, [x2], #4 # CHECK-NEXT: [1,1] . . DeeER . . ldr w5, [x3] # CHECK-NEXT: [1,2] . . . DeeeER . madd w0, w5, w4, w0 -# CHECK-NEXT: [1,3] . . . DeeE-R . add x3, x3, x13 +# CHECK-NEXT: [1,3] . . . DeeER . add x3, x3, x13 # CHECK-NEXT: [1,4] . . . DeeER . subs x1, x1, #1 # CHECK-NEXT: [1,5] . . . DeeeER str w0, [x21, x18, lsl #2] @@ -126,7 +126,7 @@ # CHECK-NEXT: 0. 2 0.0 0.0 0.0 ldr w4, [x2], #4 # CHECK-NEXT: 1. 2 0.0 0.0 0.0 ldr w5, [x3] # CHECK-NEXT: 2. 2 0.0 0.0 0.0 madd w0, w5, w4, w0 -# CHECK-NEXT: 3. 2 0.0 0.0 1.0 add x3, x3, x13 +# CHECK-NEXT: 3. 2 0.0 0.0 0.0 add x3, x3, x13 # CHECK-NEXT: 4. 2 0.0 0.0 0.0 subs x1, x1, #1 # CHECK-NEXT: 5. 
2 0.0 0.0 0.0 str w0, [x21, x18, lsl #2] -# CHECK-NEXT: 2 0.0 0.0 0.2 +# CHECK-NEXT: 2 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s @@ -10,12 +10,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 12 -# CHECK-NEXT: Total Cycles: 18 +# CHECK-NEXT: Total Cycles: 20 # CHECK-NEXT: Total uOps: 12 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.67 -# CHECK-NEXT: IPC: 0.67 +# CHECK-NEXT: uOps Per Cycle: 0.60 +# CHECK-NEXT: IPC: 0.60 # CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Instruction Info: @@ -40,33 +40,37 @@ # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 -# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 5 (27.8%) +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 1 (5.0%) # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 12 (66.7%) -# CHECK-NEXT: 2, 6 (33.3%) +# CHECK-NEXT: 0, 12 (60.0%) +# CHECK-NEXT: 1, 4 (20.0%) +# CHECK-NEXT: 2, 4 (20.0%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 12 (66.7%) -# CHECK-NEXT: 2, 6 (33.3%) +# CHECK-NEXT: 0, 12 (60.0%) +# CHECK-NEXT: 1, 4 (20.0%) +# CHECK-NEXT: 2, 4 (20.0%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: No scheduler resources used. 
# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: # CHECK-NEXT: [# retired], [# cycles] -# CHECK-NEXT: 0, 16 (88.9%) -# CHECK-NEXT: 6, 2 (11.1%) +# CHECK-NEXT: 0, 14 (70.0%) +# CHECK-NEXT: 1, 2 (10.0%) +# CHECK-NEXT: 2, 2 (10.0%) +# CHECK-NEXT: 3, 2 (10.0%) # CHECK: Total ROB Entries: 64 -# CHECK-NEXT: Max Used ROB Entries: 8 ( 12.5% ) -# CHECK-NEXT: Average Used ROB Entries per cy: 5 ( 7.8% ) +# CHECK-NEXT: Max Used ROB Entries: 7 ( 10.9% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) # CHECK: Register File statistics: # CHECK-NEXT: Total number of mappings created: 12 -# CHECK-NEXT: Max number of mappings used: 8 +# CHECK-NEXT: Max number of mappings used: 7 # CHECK: Resources: # CHECK-NEXT: [0.0] - CortexA55UnitALU @@ -96,21 +100,21 @@ # CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w7, w9, w0 # CHECK: Timeline view: -# CHECK-NEXT: 01234567 +# CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . . sdiv w12, w21, w0 -# CHECK-NEXT: [0,1] DeeE-----R. . . add w8, w8, #1 -# CHECK-NEXT: [0,2] .DeeE----R. . . add w1, w2, w0 -# CHECK-NEXT: [0,3] .DeeE----R. . . add w3, w4, #1 -# CHECK-NEXT: [0,4] . DeeE---R. . . add w5, w6, w0 -# CHECK-NEXT: [0,5] . DeeE---R. . . add w7, w9, w0 -# CHECK-NEXT: [1,0] . . DeeeeeeeER sdiv w12, w21, w0 -# CHECK-NEXT: [1,1] . . DeeE-----R add w8, w8, #1 -# CHECK-NEXT: [1,2] . . DeeE----R add w1, w2, w0 -# CHECK-NEXT: [1,3] . . DeeE----R add w3, w4, #1 -# CHECK-NEXT: [1,4] . . DeeE---R add w5, w6, w0 -# CHECK-NEXT: [1,5] . . DeeE---R add w7, w9, w0 +# CHECK: [0,0] DeeeeeeeER. . . sdiv w12, w21, w0 +# CHECK-NEXT: [0,1] . DeeER. . . add w8, w8, #1 +# CHECK-NEXT: [0,2] . DeeER. . . add w1, w2, w0 +# CHECK-NEXT: [0,3] . .DeeER . . add w3, w4, #1 +# CHECK-NEXT: [0,4] . .DeeER . . add w5, w6, w0 +# CHECK-NEXT: [0,5] . . DeeER . . add w7, w9, w0 +# CHECK-NEXT: [1,0] . . DeeeeeeeER . sdiv w12, w21, w0 +# CHECK-NEXT: [1,1] . . . DeeER . add w8, w8, #1 +# CHECK-NEXT: [1,2] . 
. . DeeER . add w1, w2, w0 +# CHECK-NEXT: [1,3] . . . DeeER. add w3, w4, #1 +# CHECK-NEXT: [1,4] . . . DeeER. add w5, w6, w0 +# CHECK-NEXT: [1,5] . . . DeeER add w7, w9, w0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -120,9 +124,9 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 0.0 0.0 0.0 sdiv w12, w21, w0 -# CHECK-NEXT: 1. 2 0.0 0.0 5.0 add w8, w8, #1 -# CHECK-NEXT: 2. 2 0.0 0.0 4.0 add w1, w2, w0 -# CHECK-NEXT: 3. 2 0.0 0.0 4.0 add w3, w4, #1 -# CHECK-NEXT: 4. 2 0.0 0.0 3.0 add w5, w6, w0 -# CHECK-NEXT: 5. 2 0.0 0.0 3.0 add w7, w9, w0 -# CHECK-NEXT: 2 0.0 0.0 3.2 +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 add w8, w8, #1 +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 add w1, w2, w0 +# CHECK-NEXT: 3. 2 0.0 0.0 0.0 add w3, w4, #1 +# CHECK-NEXT: 4. 2 0.0 0.0 0.0 add w5, w6, w0 +# CHECK-NEXT: 5. 2 0.0 0.0 0.0 add w7, w9, w0 +# CHECK-NEXT: 2 0.0 0.0 0.0