diff --git a/llvm/docs/CommandGuide/llvm-mca.rst b/llvm/docs/CommandGuide/llvm-mca.rst --- a/llvm/docs/CommandGuide/llvm-mca.rst +++ b/llvm/docs/CommandGuide/llvm-mca.rst @@ -975,7 +975,6 @@ the ``IssueWidth`` parameter in LLVM's scheduling model. Once issued, an instruction is moved to ``IssuedInst`` set until it is ready to -retire. If ``RetireControlUnit`` is defined in the LLVM's scheduling model, -:program:`llvm-mca` ensures that instructions are retired in-order. However, an -instruction is allowed to retire out-of-order if ``RetireOOO`` property is true -for at least one of its writes. +retire. :program:`llvm-mca` ensures that writes are committed in-order. However, +an instruction is allowed to commit writes and retire out-of-order if +``RetireOOO`` property is true for at least one of its writes. diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h --- a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h +++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h @@ -27,12 +27,10 @@ namespace mca { class RegisterFile; class ResourceManager; -struct RetireControlUnit; class InOrderIssueStage final : public Stage { const MCSchedModel &SM; const MCSubtargetInfo &STI; - RetireControlUnit &RCU; RegisterFile &PRF; std::unique_ptr RM; @@ -67,14 +65,16 @@ Error tryIssue(InstRef &IR, unsigned *StallCycles); /// Update status of instructions from IssuedInst. - Error updateIssuedInst(); + void updateIssuedInst(); + + /// Retire instruction once it is executed. + void retireInstruction(InstRef &IR); public: - InOrderIssueStage(RetireControlUnit &RCU, RegisterFile &PRF, - const MCSchedModel &SM, const MCSubtargetInfo &STI) - : SM(SM), STI(STI), RCU(RCU), PRF(PRF), - RM(std::make_unique(SM)), NumIssued(0), - StallCyclesLeft(0), Bandwidth(0), LastWriteBackCycle(0) {} + InOrderIssueStage(RegisterFile &PRF, const MCSchedModel &SM, + const MCSubtargetInfo &STI) + : SM(SM), STI(STI), PRF(PRF), RM(std::make_unique(SM)), + NumIssued(0), StallCyclesLeft(0), Bandwidth(0), LastWriteBackCycle(0) {} bool isAvailable(const InstRef &) const override; bool hasWorkToComplete() const override; diff --git a/llvm/include/llvm/MCA/Stages/RetireStage.h b/llvm/include/llvm/MCA/Stages/RetireStage.h --- a/llvm/include/llvm/MCA/Stages/RetireStage.h +++ b/llvm/include/llvm/MCA/Stages/RetireStage.h @@ -30,7 +30,6 @@ RetireControlUnit &RCU; RegisterFile &PRF; LSUnitBase &LSU; - SmallVector RetireInst; RetireStage(const RetireStage &Other) = delete; RetireStage &operator=(const RetireStage &Other) = delete; @@ -39,9 +38,7 @@ RetireStage(RetireControlUnit &R, RegisterFile &F, LSUnitBase &LS) : Stage(), RCU(R), PRF(F), LSU(LS) {} - bool hasWorkToComplete() const override { - return !RCU.isEmpty() || !RetireInst.empty(); - } + bool hasWorkToComplete() const override { return !RCU.isEmpty(); } Error cycleStart() override; Error cycleEnd() override; Error execute(InstRef &IR) override; diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp --- a/llvm/lib/MCA/Context.cpp +++ b/llvm/lib/MCA/Context.cpp @@ -71,23 +71,16 @@ std::unique_ptr Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) { const MCSchedModel &SM = STI.getSchedModel(); - auto RCU = std::make_unique(SM); auto PRF = std::make_unique(SM, MRI, Opts.RegisterFileSize); - auto LSU = std::make_unique(SM, Opts.LoadQueueSize, - Opts.StoreQueueSize, Opts.AssumeNoAlias); auto Entry = std::make_unique(SrcMgr); - auto InOrderIssue = std::make_unique(*RCU, *PRF, SM, STI); - auto Retire = std::make_unique(*RCU, *PRF, *LSU); + auto InOrderIssue = std::make_unique(*PRF, SM, STI); auto StagePipeline = std::make_unique(); StagePipeline->appendStage(std::move(Entry)); StagePipeline->appendStage(std::move(InOrderIssue)); - StagePipeline->appendStage(std::move(Retire)); - addHardwareUnit(std::move(RCU)); addHardwareUnit(std::move(PRF)); - addHardwareUnit(std::move(LSU)); return StagePipeline; } diff --git a/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp b/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp --- a/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp +++ b/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp @@ -23,6 +23,8 @@ : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), AvailableEntries(SM.isOutOfOrder() ? SM.MicroOpBufferSize : 0), MaxRetirePerCycle(0) { + assert(SM.isOutOfOrder() && + "RetireControlUnit is not available for in-order processors"); // Check if the scheduling model provides extra information about the machine // processor. If so, then use that information to set the reorder buffer size // and the maximum number of instructions retired per cycle. @@ -33,17 +35,12 @@ MaxRetirePerCycle = EPI.MaxRetirePerCycle; } NumROBEntries = AvailableEntries; - if (!SM.isOutOfOrder() && !NumROBEntries) - return; assert(NumROBEntries && "Invalid reorder buffer size!"); Queue.resize(2 * NumROBEntries); } // Reserves a number of slots, and returns a new token. unsigned RetireControlUnit::dispatch(const InstRef &IR) { - if (!NumROBEntries) - return UnhandledTokenID; - const Instruction &Inst = *IR.getInstruction(); unsigned Entries = normalizeQuantity(Inst.getNumMicroOps()); assert((AvailableEntries >= Entries) && "Reorder Buffer unavailable!"); diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp --- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp +++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp @@ -182,7 +182,7 @@ PRF.addRegisterWrite(WriteRef(SourceIndex, &WS), UsedRegs); } -static void notifyInstructionExecute( +static void notifyInstructionIssue( const InstRef &IR, const SmallVectorImpl> &UsedRes, const Stage &S) { @@ -205,28 +205,11 @@ } llvm::Error InOrderIssueStage::execute(InstRef &IR) { - Instruction &IS = *IR.getInstruction(); - const InstrDesc &Desc = IS.getDesc(); - - unsigned RCUTokenID = RetireControlUnit::UnhandledTokenID; - if (!Desc.RetireOOO) - RCUTokenID = RCU.dispatch(IR); - IS.dispatch(RCUTokenID); - - if (Desc.EndGroup) { - Bandwidth = 0; - } else { - unsigned NumMicroOps = IR.getInstruction()->getNumMicroOps(); - assert(Bandwidth >= NumMicroOps); - Bandwidth -= NumMicroOps; - } - if (llvm::Error E = tryIssue(IR, &StallCyclesLeft)) return E; if (StallCyclesLeft) { StalledInst = IR; - Bandwidth = 0; } return llvm::ErrorSuccess(); @@ -235,20 +218,26 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) { Instruction &IS = *IR.getInstruction(); unsigned SourceIndex = IR.getSourceIndex(); + const InstrDesc &Desc = IS.getDesc(); if (!canExecute(IR, StallCycles)) { LLVM_DEBUG(dbgs() << "[E] Stalled #" << IR << " for " << *StallCycles << " cycles\n"); + Bandwidth = 0; return llvm::ErrorSuccess(); } + unsigned RCUTokenID = RetireControlUnit::UnhandledTokenID; + IS.dispatch(RCUTokenID); + SmallVector UsedRegs(PRF.getNumRegisterFiles()); addRegisterReadWrite(PRF, IS, SourceIndex, STI, UsedRegs); - notifyInstructionDispatch(IR, IS.getDesc().NumMicroOps, UsedRegs, *this); + unsigned NumMicroOps = IS.getNumMicroOps(); + notifyInstructionDispatch(IR, NumMicroOps, UsedRegs, *this); SmallVector, 4> UsedResources; - RM->issueInstruction(IS.getDesc(), UsedResources); + RM->issueInstruction(Desc, UsedResources); IS.execute(SourceIndex); // Replace resource masks with valid resource processor IDs. @@ -256,10 +245,17 @@ uint64_t Mask = Use.first.first; Use.first.first = RM->resolveResourceMask(Mask); } - notifyInstructionExecute(IR, UsedResources, *this); + notifyInstructionIssue(IR, UsedResources, *this); + + if (Desc.EndGroup) { + Bandwidth = 0; + } else { + assert(Bandwidth >= NumMicroOps); + Bandwidth -= NumMicroOps; + } IssuedInst.push_back(IR); - ++NumIssued; + NumIssued += NumMicroOps; if (!IR.getInstruction()->getDesc().RetireOOO) LastWriteBackCycle = findLastWriteBackCycle(IR); @@ -267,7 +263,7 @@ return llvm::ErrorSuccess(); } -llvm::Error InOrderIssueStage::updateIssuedInst() { +void InOrderIssueStage::updateIssuedInst() { // Update other instructions. Executed instructions will be retired during the // next cycle. unsigned NumExecuted = 0; @@ -283,29 +279,37 @@ ++I; continue; } + + PRF.onInstructionExecuted(&IS); notifyEvent( HWInstructionEvent(HWInstructionEvent::Executed, IR)); - LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR << " is executed\n"); ++NumExecuted; + + retireInstruction(*I); + std::iter_swap(I, E - NumExecuted); } - // Retire instructions in the next cycle - if (NumExecuted) { - for (auto I = IssuedInst.end() - NumExecuted, E = IssuedInst.end(); I != E; - ++I) { - if (llvm::Error E = moveToTheNextStage(*I)) - return E; - } + if (NumExecuted) IssuedInst.resize(IssuedInst.size() - NumExecuted); - } +} - return llvm::ErrorSuccess(); +void InOrderIssueStage::retireInstruction(InstRef &IR) { + Instruction &IS = *IR.getInstruction(); + IS.retire(); + + llvm::SmallVector FreedRegs(PRF.getNumRegisterFiles()); + for (const WriteState &WS : IS.getDefs()) + PRF.removeRegisterWrite(WS, FreedRegs); + + notifyEvent(HWInstructionRetiredEvent(IR, FreedRegs)); + LLVM_DEBUG(dbgs() << "[E] Retired #" << IR << " \n"); } llvm::Error InOrderIssueStage::cycleStart() { NumIssued = 0; + Bandwidth = SM.IssueWidth; PRF.cycleStart(); @@ -313,8 +317,7 @@ SmallVector Freed; RM->cycleEvent(Freed); - if (llvm::Error E = updateIssuedInst()) - return E; + updateIssuedInst(); // Issue instructions scheduled for this cycle if (!StallCyclesLeft && StalledInst) { @@ -325,7 +328,6 @@ if (!StallCyclesLeft) { StalledInst.invalidate(); assert(NumIssued <= SM.IssueWidth && "Overflow."); - Bandwidth = SM.IssueWidth - NumIssued; } else { // The instruction is still stalled, cannot issue any new instructions in // this cycle. diff --git a/llvm/lib/MCA/Stages/RetireStage.cpp b/llvm/lib/MCA/Stages/RetireStage.cpp --- a/llvm/lib/MCA/Stages/RetireStage.cpp +++ b/llvm/lib/MCA/Stages/RetireStage.cpp @@ -38,13 +38,6 @@ NumRetired++; } - // Retire instructions that are not controlled by the RCU - for (InstRef &IR : RetireInst) { - IR.getInstruction()->retire(); - notifyInstructionRetired(IR); - } - RetireInst.resize(0); - return llvm::ErrorSuccess(); } @@ -58,12 +51,9 @@ PRF.onInstructionExecuted(&IS); unsigned TokenID = IS.getRCUTokenID(); - if (TokenID != RetireControlUnit::UnhandledTokenID) { - RCU.onInstructionExecuted(TokenID); - return llvm::ErrorSuccess(); - } + assert(TokenID != RetireControlUnit::UnhandledTokenID); + RCU.onInstructionExecuted(TokenID); - RetireInst.push_back(IR); return llvm::ErrorSuccess(); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td --- a/llvm/lib/Target/AArch64/AArch64SchedA55.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td @@ -339,5 +339,4 @@ def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; -def A55RCU : RetireControlUnit<64, 0>; } diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s @@ -8,12 +8,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 8 -# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total Cycles: 9 # CHECK-NEXT: Total uOps: 8 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.80 -# CHECK-NEXT: IPC: 0.80 +# CHECK-NEXT: uOps Per Cycle: 0.89 +# CHECK-NEXT: IPC: 0.89 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -56,16 +56,16 @@ # CHECK-NEXT: 1.00 - - - - - - - - - - - add w1, w0, #4 # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeeER. . add w2, w3, #1 -# CHECK-NEXT: [0,1] DeeER. . add w4, w3, #2, lsl #12 -# CHECK-NEXT: [0,2] .DeeER . add w0, w4, #3 -# CHECK-NEXT: [0,3] . DeeER . add w1, w0, #4 -# CHECK-NEXT: [1,0] . DeeER . add w2, w3, #1 -# CHECK-NEXT: [1,1] . DeeER . add w4, w3, #2, lsl #12 -# CHECK-NEXT: [1,2] . DeeER. add w0, w4, #3 -# CHECK-NEXT: [1,3] . DeeER add w1, w0, #4 +# CHECK: [0,0] DeeE . . add w2, w3, #1 +# CHECK-NEXT: [0,1] DeeE . . add w4, w3, #2, lsl #12 +# CHECK-NEXT: [0,2] .DeeE. . add w0, w4, #3 +# CHECK-NEXT: [0,3] . DeeE . add w1, w0, #4 +# CHECK-NEXT: [1,0] . DeeE . add w2, w3, #1 +# CHECK-NEXT: [1,1] . DeeE . add w4, w3, #2, lsl #12 +# CHECK-NEXT: [1,2] . DeeE. add w0, w4, #3 +# CHECK-NEXT: [1,3] . DeeE add w1, w0, #4 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s @@ -10,12 +10,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 12 -# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total Cycles: 20 # CHECK-NEXT: Total uOps: 14 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.67 -# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: uOps Per Cycle: 0.70 +# CHECK-NEXT: IPC: 0.60 # CHECK-NEXT: Block RThroughput: 3.5 # CHECK: Instruction Info: @@ -35,7 +35,7 @@ # CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] # CHECK: Dynamic Dispatch Stall Cycles: -# CHECK-NEXT: RAT - Register unavailable: 8 (38.1%) +# CHECK-NEXT: RAT - Register unavailable: 8 (40.0%) # CHECK-NEXT: RCU - Retire tokens unavailable: 0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 @@ -44,33 +44,22 @@ # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 11 (52.4%) -# CHECK-NEXT: 1, 6 (28.6%) -# CHECK-NEXT: 2, 4 (19.0%) +# CHECK-NEXT: 0, 10 (50.0%) +# CHECK-NEXT: 1, 6 (30.0%) +# CHECK-NEXT: 2, 4 (20.0%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 11 (52.4%) -# CHECK-NEXT: 1, 6 (28.6%) -# CHECK-NEXT: 2, 4 (19.0%) +# CHECK-NEXT: 0, 10 (50.0%) +# CHECK-NEXT: 1, 6 (30.0%) +# CHECK-NEXT: 2, 4 (20.0%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: No scheduler resources used. -# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: -# CHECK-NEXT: [# retired], [# cycles] -# CHECK-NEXT: 0, 14 (66.7%) -# CHECK-NEXT: 1, 4 (19.0%) -# CHECK-NEXT: 2, 1 (4.8%) -# CHECK-NEXT: 3, 2 (9.5%) - -# CHECK: Total ROB Entries: 64 -# CHECK-NEXT: Max Used ROB Entries: 6 ( 9.4% ) -# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) - # CHECK: Register File statistics: # CHECK-NEXT: Total number of mappings created: 14 -# CHECK-NEXT: Max number of mappings used: 6 +# CHECK-NEXT: Max number of mappings used: 4 # CHECK: Resources: # CHECK-NEXT: [0.0] - CortexA55UnitALU diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s @@ -10,12 +10,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 12 -# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total Cycles: 20 # CHECK-NEXT: Total uOps: 14 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.67 -# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: uOps Per Cycle: 0.70 +# CHECK-NEXT: IPC: 0.60 # CHECK-NEXT: Block RThroughput: 3.5 # CHECK: Instruction Info: @@ -35,7 +35,7 @@ # CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] # CHECK: Dynamic Dispatch Stall Cycles: -# CHECK-NEXT: RAT - Register unavailable: 8 (38.1%) +# CHECK-NEXT: RAT - Register unavailable: 8 (40.0%) # CHECK-NEXT: RCU - Retire tokens unavailable: 0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 @@ -44,33 +44,22 @@ # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 11 (52.4%) -# CHECK-NEXT: 1, 6 (28.6%) -# CHECK-NEXT: 2, 4 (19.0%) +# CHECK-NEXT: 0, 10 (50.0%) +# CHECK-NEXT: 1, 6 (30.0%) +# CHECK-NEXT: 2, 4 (20.0%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 11 (52.4%) -# CHECK-NEXT: 1, 6 (28.6%) -# CHECK-NEXT: 2, 4 (19.0%) +# CHECK-NEXT: 0, 10 (50.0%) +# CHECK-NEXT: 1, 6 (30.0%) +# CHECK-NEXT: 2, 4 (20.0%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: No scheduler resources used. -# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: -# CHECK-NEXT: [# retired], [# cycles] -# CHECK-NEXT: 0, 14 (66.7%) -# CHECK-NEXT: 1, 4 (19.0%) -# CHECK-NEXT: 2, 1 (4.8%) -# CHECK-NEXT: 3, 2 (9.5%) - -# CHECK: Total ROB Entries: 64 -# CHECK-NEXT: Max Used ROB Entries: 6 ( 9.4% ) -# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) - # CHECK: Register File statistics: # CHECK-NEXT: Total number of mappings created: 14 -# CHECK-NEXT: Max number of mappings used: 6 +# CHECK-NEXT: Max number of mappings used: 4 # CHECK: Resources: # CHECK-NEXT: [0.0] - CortexA55UnitALU @@ -101,20 +90,20 @@ # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 0 - -# CHECK: [0,0] DeeER. . . . ldr w4, [x2], #4 -# CHECK-NEXT: [0,1] .DeeER . . . ldr w5, [x3] -# CHECK-NEXT: [0,2] . DeeeER. . . madd w0, w5, w4, w0 -# CHECK-NEXT: [0,3] . DeeER. . . add x3, x3, x13 -# CHECK-NEXT: [0,4] . DeeER. . . subs x1, x1, #1 -# CHECK-NEXT: [0,5] . . DeeeER . . str w0, [x21, x18, lsl #2] -# CHECK-NEXT: [1,0] . . DeeER . . ldr w4, [x2], #4 -# CHECK-NEXT: [1,1] . . DeeER . . ldr w5, [x3] -# CHECK-NEXT: [1,2] . . . DeeeER . madd w0, w5, w4, w0 -# CHECK-NEXT: [1,3] . . . DeeER . add x3, x3, x13 -# CHECK-NEXT: [1,4] . . . DeeER . subs x1, x1, #1 -# CHECK-NEXT: [1,5] . . . DeeeER str w0, [x21, x18, lsl #2] +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeE . . . . ldr w4, [x2], #4 +# CHECK-NEXT: [0,1] .DeeE. . . . ldr w5, [x3] +# CHECK-NEXT: [0,2] . DeeeE . . . madd w0, w5, w4, w0 +# CHECK-NEXT: [0,3] . DeeE . . . add x3, x3, x13 +# CHECK-NEXT: [0,4] . DeeE . . . subs x1, x1, #1 +# CHECK-NEXT: [0,5] . . DeeeE . . str w0, [x21, x18, lsl #2] +# CHECK-NEXT: [1,0] . . DeeE . . ldr w4, [x2], #4 +# CHECK-NEXT: [1,1] . . DeeE . . ldr w5, [x3] +# CHECK-NEXT: [1,2] . . . DeeeE . madd w0, w5, w4, w0 +# CHECK-NEXT: [1,3] . . . DeeE . add x3, x3, x13 +# CHECK-NEXT: [1,4] . . . DeeE . subs x1, x1, #1 +# CHECK-NEXT: [1,5] . . . DeeeE str w0, [x21, x18, lsl #2] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s @@ -10,12 +10,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 12 -# CHECK-NEXT: Total Cycles: 20 +# CHECK-NEXT: Total Cycles: 19 # CHECK-NEXT: Total uOps: 12 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.60 -# CHECK-NEXT: IPC: 0.60 +# CHECK-NEXT: uOps Per Cycle: 0.63 +# CHECK-NEXT: IPC: 0.63 # CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Instruction Info: @@ -40,37 +40,26 @@ # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 -# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 1 (5.0%) +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 1 (5.3%) # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 12 (60.0%) -# CHECK-NEXT: 1, 4 (20.0%) -# CHECK-NEXT: 2, 4 (20.0%) +# CHECK-NEXT: 0, 11 (57.9%) +# CHECK-NEXT: 1, 4 (21.1%) +# CHECK-NEXT: 2, 4 (21.1%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 12 (60.0%) -# CHECK-NEXT: 1, 4 (20.0%) -# CHECK-NEXT: 2, 4 (20.0%) +# CHECK-NEXT: 0, 11 (57.9%) +# CHECK-NEXT: 1, 4 (21.1%) +# CHECK-NEXT: 2, 4 (21.1%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: No scheduler resources used. -# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: -# CHECK-NEXT: [# retired], [# cycles] -# CHECK-NEXT: 0, 14 (70.0%) -# CHECK-NEXT: 1, 2 (10.0%) -# CHECK-NEXT: 2, 2 (10.0%) -# CHECK-NEXT: 3, 2 (10.0%) - -# CHECK: Total ROB Entries: 64 -# CHECK-NEXT: Max Used ROB Entries: 7 ( 10.9% ) -# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) - # CHECK: Register File statistics: # CHECK-NEXT: Total number of mappings created: 12 -# CHECK-NEXT: Max number of mappings used: 7 +# CHECK-NEXT: Max number of mappings used: 6 # CHECK: Resources: # CHECK-NEXT: [0.0] - CortexA55UnitALU @@ -100,21 +89,21 @@ # CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w7, w9, w0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . . sdiv w12, w21, w0 -# CHECK-NEXT: [0,1] . DeeER. . . add w8, w8, #1 -# CHECK-NEXT: [0,2] . DeeER. . . add w1, w2, w0 -# CHECK-NEXT: [0,3] . .DeeER . . add w3, w4, #1 -# CHECK-NEXT: [0,4] . .DeeER . . add w5, w6, w0 -# CHECK-NEXT: [0,5] . . DeeER . . add w7, w9, w0 -# CHECK-NEXT: [1,0] . . DeeeeeeeER . sdiv w12, w21, w0 -# CHECK-NEXT: [1,1] . . . DeeER . add w8, w8, #1 -# CHECK-NEXT: [1,2] . . . DeeER . add w1, w2, w0 -# CHECK-NEXT: [1,3] . . . DeeER. add w3, w4, #1 -# CHECK-NEXT: [1,4] . . . DeeER. add w5, w6, w0 -# CHECK-NEXT: [1,5] . . . DeeER add w7, w9, w0 +# CHECK: [0,0] DeeeeeeeE . . . sdiv w12, w21, w0 +# CHECK-NEXT: [0,1] . DeeE . . . add w8, w8, #1 +# CHECK-NEXT: [0,2] . DeeE . . . add w1, w2, w0 +# CHECK-NEXT: [0,3] . .DeeE. . . add w3, w4, #1 +# CHECK-NEXT: [0,4] . .DeeE. . . add w5, w6, w0 +# CHECK-NEXT: [0,5] . . DeeE . . add w7, w9, w0 +# CHECK-NEXT: [1,0] . . DeeeeeeeE . sdiv w12, w21, w0 +# CHECK-NEXT: [1,1] . . . DeeE . add w8, w8, #1 +# CHECK-NEXT: [1,2] . . . DeeE . add w1, w2, w0 +# CHECK-NEXT: [1,3] . . . DeeE. add w3, w4, #1 +# CHECK-NEXT: [1,4] . . . DeeE. add w5, w6, w0 +# CHECK-NEXT: [1,5] . . . DeeE add w7, w9, w0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s @@ -10,12 +10,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 12 -# CHECK-NEXT: Total Cycles: 25 +# CHECK-NEXT: Total Cycles: 24 # CHECK-NEXT: Total uOps: 12 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.48 -# CHECK-NEXT: IPC: 0.48 +# CHECK-NEXT: uOps Per Cycle: 0.50 +# CHECK-NEXT: IPC: 0.50 # CHECK-NEXT: Block RThroughput: 10.0 # CHECK: Instruction Info: @@ -40,31 +40,21 @@ # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 -# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 7 (28.0%) +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 7 (29.2%) # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 19 (76.0%) -# CHECK-NEXT: 2, 6 (24.0%) +# CHECK-NEXT: 0, 18 (75.0%) +# CHECK-NEXT: 2, 6 (25.0%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 19 (76.0%) -# CHECK-NEXT: 2, 6 (24.0%) +# CHECK-NEXT: 0, 18 (75.0%) +# CHECK-NEXT: 2, 6 (25.0%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: No scheduler resources used. -# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: -# CHECK-NEXT: [# retired], [# cycles] -# CHECK-NEXT: 0, 18 (72.0%) -# CHECK-NEXT: 1, 2 (8.0%) -# CHECK-NEXT: 2, 5 (20.0%) - -# CHECK: Total ROB Entries: 64 -# CHECK-NEXT: Max Used ROB Entries: 7 ( 10.9% ) -# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) - # CHECK: Register File statistics: # CHECK-NEXT: Total number of mappings created: 12 # CHECK-NEXT: Max number of mappings used: 7 @@ -98,20 +88,20 @@ # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 01234 - -# CHECK: [0,0] DeeeeeeeeeeeeER. . . fdiv s1, s2, s3 -# CHECK-NEXT: [0,1] DeeER. . . . . add w8, w8, #1 -# CHECK-NEXT: [0,2] .DeeER . . . . add w1, w2, w0 -# CHECK-NEXT: [0,3] .DeeER . . . . add w3, w4, #1 -# CHECK-NEXT: [0,4] . DeeER . . . . add w5, w6, w0 -# CHECK-NEXT: [0,5] . DeeER . . . . add w7, w9, w0 -# CHECK-NEXT: [1,0] . . DeeeeeeeeeeeeER fdiv s1, s2, s3 -# CHECK-NEXT: [1,1] . . DeeER. . . add w8, w8, #1 -# CHECK-NEXT: [1,2] . . .DeeER . . add w1, w2, w0 -# CHECK-NEXT: [1,3] . . .DeeER . . add w3, w4, #1 -# CHECK-NEXT: [1,4] . . . DeeER . . add w5, w6, w0 -# CHECK-NEXT: [1,5] . . . DeeER . . add w7, w9, w0 +# CHECK-NEXT: Index 0123456789 0123 + +# CHECK: [0,0] DeeeeeeeeeeeeE . . . fdiv s1, s2, s3 +# CHECK-NEXT: [0,1] DeeE . . . . . add w8, w8, #1 +# CHECK-NEXT: [0,2] .DeeE. . . . . add w1, w2, w0 +# CHECK-NEXT: [0,3] .DeeE. . . . . add w3, w4, #1 +# CHECK-NEXT: [0,4] . DeeE . . . . add w5, w6, w0 +# CHECK-NEXT: [0,5] . DeeE . . . . add w7, w9, w0 +# CHECK-NEXT: [1,0] . . DeeeeeeeeeeeeE fdiv s1, s2, s3 +# CHECK-NEXT: [1,1] . . DeeE . . . add w8, w8, #1 +# CHECK-NEXT: [1,2] . . .DeeE. . . add w1, w2, w0 +# CHECK-NEXT: [1,3] . . .DeeE. . . add w3, w4, #1 +# CHECK-NEXT: [1,4] . . . DeeE . . add w5, w6, w0 +# CHECK-NEXT: [1,5] . . . DeeE . . add w7, w9, w0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-add-sequence.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-add-sequence.s --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-add-sequence.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-add-sequence.s @@ -7,12 +7,12 @@ # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 3 -# CHECK-NEXT: Total Cycles: 13 +# CHECK-NEXT: Total Cycles: 12 # CHECK-NEXT: Total uOps: 3 # CHECK: Dispatch Width: 1 -# CHECK-NEXT: uOps Per Cycle: 0.23 -# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 # CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Instruction Info: @@ -48,12 +48,12 @@ # CHECK-NEXT: - - - 1.00 - 1.00 - v_add_f32_e32 v2, v1, v0 # CHECK: Timeline view: -# CHECK-NEXT: 012 +# CHECK-NEXT: 01 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . . v_add_f32_e32 v0, v0, v0 -# CHECK-NEXT: [0,1] .DeeeeER . . v_add_f32_e32 v1, v1, v1 -# CHECK-NEXT: [0,2] . .DeeeeER v_add_f32_e32 v2, v1, v0 +# CHECK: [0,0] DeeeeE .. v_add_f32_e32 v0, v0, v0 +# CHECK-NEXT: [0,1] .DeeeeE .. v_add_f32_e32 v1, v1, v1 +# CHECK-NEXT: [0,2] . .DeeeeE v_add_f32_e32 v2, v1, v0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s @@ -42,7 +42,7 @@ # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 27 -# CHECK-NEXT: Total Cycles: 205 +# CHECK-NEXT: Total Cycles: 204 # CHECK-NEXT: Total uOps: 27 # CHECK: Dispatch Width: 1 @@ -134,19 +134,19 @@ # CHECK-NEXT: 0123456789 0123456789 0123456789 0 # CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 -# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1] -# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeER. . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2 -# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5] -# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6 -# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9] -# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10 -# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeER . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1] -# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeER . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeER . . . . . v_fract_f64_e32 v[4:5], v[4:5] -# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeER . v_trunc_f64_e32 v[0:1], v[0:1] -# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeER . v_ceil_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeER. v_rndne_f64_e32 v[4:5], v[4:5] -# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeER v_floor_f64_e32 v[6:7], v[6:7] +# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1] +# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2 +# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5] +# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6 +# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9] +# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10 +# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1] +# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . v_fract_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . v_trunc_f64_e32 v[0:1], v[0:1] +# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . v_ceil_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . v_rndne_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. v_floor_f64_e32 v[6:7], v[6:7] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s b/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s --- a/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s +++ b/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s @@ -9,12 +9,12 @@ # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 3 -# CHECK-NEXT: Total Cycles: 7 +# CHECK-NEXT: Total Cycles: 6 # CHECK-NEXT: Total uOps: 3 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.43 -# CHECK-NEXT: IPC: 0.43 +# CHECK-NEXT: uOps Per Cycle: 0.50 +# CHECK-NEXT: IPC: 0.50 # CHECK-NEXT: Block RThroughput: 1.5 # CHECK: Instruction Info: @@ -56,11 +56,11 @@ # CHECK-NEXT: - - - - 1.00 - - - - - - - 2.00 vldr d0, [r1] # CHECK: Timeline view: -# CHECK-NEXT: Index 0123456 +# CHECK-NEXT: Index 012345 -# CHECK: [0,0] DER .. add.w r1, r1, #1 -# CHECK-NEXT: [0,1] .DER .. add.w r1, r1, #2 -# CHECK-NEXT: [0,2] . DeER vldr d0, [r1] +# CHECK: [0,0] DE . add.w r1, r1, #1 +# CHECK-NEXT: [0,1] .DE . add.w r1, r1, #2 +# CHECK-NEXT: [0,2] . DeE vldr d0, [r1] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/tools/llvm-mca/Views/TimelineView.cpp b/llvm/tools/llvm-mca/Views/TimelineView.cpp --- a/llvm/tools/llvm-mca/Views/TimelineView.cpp +++ b/llvm/tools/llvm-mca/Views/TimelineView.cpp @@ -77,8 +77,10 @@ "Instruction cannot be ready if it hasn't been dispatched yet!"); WTEntry.CyclesSpentInSQWhileReady += TVEntry.CycleIssued - TVEntry.CycleReady; - WTEntry.CyclesSpentAfterWBAndBeforeRetire += - (CurrentCycle - 1) - TVEntry.CycleExecuted; + if (CurrentCycle > TVEntry.CycleExecuted) { + WTEntry.CyclesSpentAfterWBAndBeforeRetire += + (CurrentCycle - 1) - TVEntry.CycleExecuted; + } break; } case HWInstructionEvent::Ready: @@ -243,7 +245,8 @@ for (unsigned I = Entry.CycleExecuted + 1, E = Entry.CycleRetired; I < E; ++I) OS << TimelineView::DisplayChar::RetireLag; - OS << TimelineView::DisplayChar::Retired; + if (Entry.CycleExecuted < Entry.CycleRetired) + OS << TimelineView::DisplayChar::Retired; // Skip other columns. for (unsigned I = Entry.CycleRetired + 1, E = LastCycle; I <= E; ++I) diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -278,7 +278,8 @@ processOptionImpl(PrintRegisterFileStats, Default); processOptionImpl(PrintDispatchStats, Default); processOptionImpl(PrintSchedulerStats, Default); - processOptionImpl(PrintRetireStats, Default); + if (IsOutOfOrder) + processOptionImpl(PrintRetireStats, Default); } // Returns true on success.