Index: docs/CommandGuide/llvm-mca.rst =================================================================== --- docs/CommandGuide/llvm-mca.rst +++ docs/CommandGuide/llvm-mca.rst @@ -466,13 +466,13 @@ Dynamic Dispatch Stall Cycles: RAT - Register unavailable: 0 RCU - Retire tokens unavailable: 0 - SCHEDQ - Scheduler full: 272 + SCHEDQ - Scheduler full: 272 (44.6%) LQ - Load queue full: 0 SQ - Store queue full: 0 GROUP - Static restrictions on the dispatch group: 0 - Dispatch Logic - number of cycles where we saw N instructions dispatched: + Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: [# dispatched], [# cycles] 0, 24 (3.9%) 1, 272 (44.6%) @@ -520,12 +520,11 @@ If we look at the *Dynamic Dispatch Stall Cycles* table, we see the counter for SCHEDQ reports 272 cycles. This counter is incremented every time the dispatch -logic is unable to dispatch a group of two instructions because the scheduler's -queue is full. +logic is unable to dispatch a full group because the scheduler's queue is full. Looking at the *Dispatch Logic* table, we see that the pipeline was only able to -dispatch two instructions 51.5% of the time. The dispatch group was limited to -one instruction 44.6% of the cycles, which corresponds to 272 cycles. The +dispatch two micro opcodes 51.5% of the time. The dispatch group was limited to +one micro opcode 44.6% of the cycles, which corresponds to 272 cycles. The dispatch statistics are displayed by either using the command option ``-all-stats`` or ``-dispatch-stats``. 
Index: test/tools/llvm-mca/X86/BtVer2/register-files-1.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/register-files-1.s +++ test/tools/llvm-mca/X86/BtVer2/register-files-1.s @@ -19,7 +19,7 @@ # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 -# CHECK: Dispatch Logic - number of cycles where we saw N instructions dispatched: +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] # CHECK-NEXT: 0, 23 (82.1%) # CHECK-NEXT: 2, 5 (17.9%) Index: test/tools/llvm-mca/X86/BtVer2/register-files-2.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/register-files-2.s +++ test/tools/llvm-mca/X86/BtVer2/register-files-2.s @@ -12,14 +12,14 @@ # CHECK-NEXT: Block RThroughput: 1.0 # CHECK: Dynamic Dispatch Stall Cycles: -# CHECK-NEXT: RAT - Register unavailable: 13 +# CHECK-NEXT: RAT - Register unavailable: 13 (46.4%) # CHECK-NEXT: RCU - Retire tokens unavailable: 0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 -# CHECK: Dispatch Logic - number of cycles where we saw N instructions dispatched: +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] # CHECK-NEXT: 0, 20 (71.4%) # CHECK-NEXT: 1, 6 (21.4%) Index: test/tools/llvm-mca/X86/BtVer2/register-files-3.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/register-files-3.s +++ test/tools/llvm-mca/X86/BtVer2/register-files-3.s @@ -22,17 +22,17 @@ # CHECK-NEXT: 2 25 25.00 U idivl %eax # CHECK: Dynamic Dispatch Stall Cycles: -# CHECK-NEXT: RAT - Register unavailable: 26 +# CHECK-NEXT: RAT - Register unavailable: 26 (47.3%) # CHECK-NEXT: RCU 
- Retire tokens unavailable: 0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 -# CHECK: Dispatch Logic - number of cycles where we saw N instructions dispatched: +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] # CHECK-NEXT: 0, 53 (96.4%) -# CHECK-NEXT: 1, 2 (3.6%) +# CHECK-NEXT: 2, 2 (3.6%) # CHECK: Register File statistics: # CHECK-NEXT: Total number of mappings created: 6 Index: test/tools/llvm-mca/X86/BtVer2/register-files-4.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/register-files-4.s +++ test/tools/llvm-mca/X86/BtVer2/register-files-4.s @@ -22,17 +22,17 @@ # CHECK-NEXT: 2 25 25.00 U idivl %eax # CHECK: Dynamic Dispatch Stall Cycles: -# CHECK-NEXT: RAT - Register unavailable: 6 +# CHECK-NEXT: RAT - Register unavailable: 6 (1.1%) # CHECK-NEXT: RCU - Retire tokens unavailable: 0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 -# CHECK: Dispatch Logic - number of cycles where we saw N instructions dispatched: +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] # CHECK-NEXT: 0, 531 (96.0%) -# CHECK-NEXT: 1, 22 (4.0%) +# CHECK-NEXT: 2, 22 (4.0%) # CHECK: Register File statistics: # CHECK-NEXT: Total number of mappings created: 66 Index: test/tools/llvm-mca/X86/BtVer2/register-files-5.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/register-files-5.s +++ test/tools/llvm-mca/X86/BtVer2/register-files-5.s @@ -44,16 +44,16 @@ # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 -# CHECK-NEXT: RCU - 
Retire tokens unavailable: 8 +# CHECK-NEXT: RCU - Retire tokens unavailable: 8 (11.6%) # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 -# CHECK: Dispatch Logic - number of cycles where we saw N instructions dispatched: +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] # CHECK-NEXT: 0, 36 (52.2%) -# CHECK-NEXT: 1, 33 (47.8%) +# CHECK-NEXT: 2, 33 (47.8%) # CHECK: Register File statistics: # CHECK-NEXT: Total number of mappings created: 66 Index: test/tools/llvm-mca/X86/Haswell/cmpxchg16b.s =================================================================== --- test/tools/llvm-mca/X86/Haswell/cmpxchg16b.s +++ test/tools/llvm-mca/X86/Haswell/cmpxchg16b.s @@ -0,0 +1,73 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -timeline -timeline-max-iterations=3 -dispatch-stats < %s | FileCheck %s + +cmpxchg16b (%rsi) + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 100 +# CHECK-NEXT: Total Cycles: 2203 +# CHECK-NEXT: Dispatch Width: 4 +# CHECK-NEXT: IPC: 0.05 +# CHECK-NEXT: Block RThroughput: 4.8 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 19 22 4.00 * * cmpxchg16b (%rsi) + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 0 +# CHECK-NEXT: RCU - Retire tokens unavailable: 1487 (67.5%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 + +# CHECK: Dispatch Logic - number 
of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 1703 (77.3%) +# CHECK-NEXT: 3, 100 (4.5%) +# CHECK-NEXT: 4, 400 (18.2%) + +# CHECK: Resources: +# CHECK-NEXT: [0] - HWDivider +# CHECK-NEXT: [1] - HWFPDivider +# CHECK-NEXT: [2] - HWPort0 +# CHECK-NEXT: [3] - HWPort1 +# CHECK-NEXT: [4] - HWPort2 +# CHECK-NEXT: [5] - HWPort3 +# CHECK-NEXT: [6] - HWPort4 +# CHECK-NEXT: [7] - HWPort5 +# CHECK-NEXT: [8] - HWPort6 +# CHECK-NEXT: [9] - HWPort7 + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# CHECK-NEXT: - - 2.00 6.00 0.66 0.67 1.00 4.00 4.00 0.67 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# CHECK-NEXT: - - 2.00 6.00 0.66 0.67 1.00 4.00 4.00 0.67 cmpxchg16b (%rsi) + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 0123456789 012345678 + +# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeER. . . . . . . . . . cmpxchg16b (%rsi) +# CHECK-NEXT: [1,0] . D=================eeeeeeeeeeeeeeeeeeeeeeER . . . . . cmpxchg16b (%rsi) +# CHECK-NEXT: [2,0] . . D==================================eeeeeeeeeeeeeeeeeeeeeeER cmpxchg16b (%rsi) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 
3 18.0 0.3 0.0 cmpxchg16b (%rsi) Index: test/tools/llvm-mca/X86/option-all-stats-1.s =================================================================== --- test/tools/llvm-mca/X86/option-all-stats-1.s +++ test/tools/llvm-mca/X86/option-all-stats-1.s @@ -27,12 +27,12 @@ # FULLREPORT: Dynamic Dispatch Stall Cycles: # FULLREPORT-NEXT: RAT - Register unavailable: 0 # FULLREPORT-NEXT: RCU - Retire tokens unavailable: 0 -# FULLREPORT-NEXT: SCHEDQ - Scheduler full: 61 +# FULLREPORT-NEXT: SCHEDQ - Scheduler full: 61 (59.2%) # FULLREPORT-NEXT: LQ - Load queue full: 0 # FULLREPORT-NEXT: SQ - Store queue full: 0 # FULLREPORT-NEXT: GROUP - Static restrictions on the dispatch group: 0 -# FULLREPORT: Dispatch Logic - number of cycles where we saw N instructions dispatched: +# FULLREPORT: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # FULLREPORT-NEXT: [# dispatched], [# cycles] # FULLREPORT-NEXT: 0, 22 (21.4%) # FULLREPORT-NEXT: 1, 62 (60.2%) Index: test/tools/llvm-mca/X86/option-all-stats-2.s =================================================================== --- test/tools/llvm-mca/X86/option-all-stats-2.s +++ test/tools/llvm-mca/X86/option-all-stats-2.s @@ -28,12 +28,12 @@ # FULL: Dynamic Dispatch Stall Cycles: # FULL-NEXT: RAT - Register unavailable: 0 # FULL-NEXT: RCU - Retire tokens unavailable: 0 -# FULL-NEXT: SCHEDQ - Scheduler full: 61 +# FULL-NEXT: SCHEDQ - Scheduler full: 61 (59.2%) # FULL-NEXT: LQ - Load queue full: 0 # FULL-NEXT: SQ - Store queue full: 0 # FULL-NEXT: GROUP - Static restrictions on the dispatch group: 0 -# FULL: Dispatch Logic - number of cycles where we saw N instructions dispatched: +# FULL: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # FULL-NEXT: [# dispatched], [# cycles] # FULL-NEXT: 0, 22 (21.4%) # FULL-NEXT: 1, 62 (60.2%) Index: test/tools/llvm-mca/X86/option-all-views-1.s =================================================================== --- 
test/tools/llvm-mca/X86/option-all-views-1.s +++ test/tools/llvm-mca/X86/option-all-views-1.s @@ -29,12 +29,12 @@ # FULLREPORT: Dynamic Dispatch Stall Cycles: # FULLREPORT-NEXT: RAT - Register unavailable: 0 # FULLREPORT-NEXT: RCU - Retire tokens unavailable: 0 -# FULLREPORT-NEXT: SCHEDQ - Scheduler full: 61 +# FULLREPORT-NEXT: SCHEDQ - Scheduler full: 61 (59.2%) # FULLREPORT-NEXT: LQ - Load queue full: 0 # FULLREPORT-NEXT: SQ - Store queue full: 0 # FULLREPORT-NEXT: GROUP - Static restrictions on the dispatch group: 0 -# FULLREPORT: Dispatch Logic - number of cycles where we saw N instructions dispatched: +# FULLREPORT: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # FULLREPORT-NEXT: [# dispatched], [# cycles] # FULLREPORT-NEXT: 0, 22 (21.4%) # FULLREPORT-NEXT: 1, 62 (60.2%) Index: test/tools/llvm-mca/X86/option-all-views-2.s =================================================================== --- test/tools/llvm-mca/X86/option-all-views-2.s +++ test/tools/llvm-mca/X86/option-all-views-2.s @@ -28,12 +28,12 @@ # ALL: Dynamic Dispatch Stall Cycles: # ALL-NEXT: RAT - Register unavailable: 0 # ALL-NEXT: RCU - Retire tokens unavailable: 0 -# ALL-NEXT: SCHEDQ - Scheduler full: 61 +# ALL-NEXT: SCHEDQ - Scheduler full: 61 (59.2%) # ALL-NEXT: LQ - Load queue full: 0 # ALL-NEXT: SQ - Store queue full: 0 # ALL-NEXT: GROUP - Static restrictions on the dispatch group: 0 -# ALL: Dispatch Logic - number of cycles where we saw N instructions dispatched: +# ALL: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # ALL-NEXT: [# dispatched], [# cycles] # ALL-NEXT: 0, 22 (21.4%) # ALL-NEXT: 1, 62 (60.2%) Index: tools/llvm-mca/Views/DispatchStatistics.h =================================================================== --- tools/llvm-mca/Views/DispatchStatistics.h +++ tools/llvm-mca/Views/DispatchStatistics.h @@ -24,7 +24,7 @@ /// GROUP - Static restrictions on the dispatch group: 0 /// /// -/// Dispatch Logic - number of cycles 
where we saw N instructions dispatched: +/// Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: /// [# dispatched], [# cycles] /// 0, 15 (11.5%) /// 2, 4 (3.1%) Index: tools/llvm-mca/Views/DispatchStatistics.cpp =================================================================== --- tools/llvm-mca/Views/DispatchStatistics.cpp +++ tools/llvm-mca/Views/DispatchStatistics.cpp @@ -26,20 +26,23 @@ } void DispatchStatistics::onEvent(const HWInstructionEvent &Event) { - if (Event.Type == HWInstructionEvent::Dispatched) - ++NumDispatched; + if (Event.Type != HWInstructionEvent::Dispatched) + return; + + const auto &DE = static_cast(Event); + NumDispatched += DE.MicroOpcodes; } void DispatchStatistics::printDispatchHistogram(llvm::raw_ostream &OS) const { std::string Buffer; raw_string_ostream TempStream(Buffer); TempStream << "\n\nDispatch Logic - " - << "number of cycles where we saw N instructions dispatched:\n"; + << "number of cycles where we saw N micro opcodes dispatched:\n"; TempStream << "[# dispatched], [# cycles]\n"; for (const std::pair &Entry : DispatchGroupSizePerCycle) { + double Percentage = ((double)Entry.second / NumCycles) * 100.0; TempStream << " " << Entry.first << ", " << Entry.second - << " (" - << format("%.1f", ((double)Entry.second / NumCycles) * 100.0) + << " (" << format("%.1f", floor((Percentage * 10) + 0.5) / 10) << "%)\n"; } @@ -47,24 +50,36 @@ OS << Buffer; } +static void printStalls(raw_ostream &OS, unsigned NumStalls, + unsigned NumCycles) { + if (!NumStalls) { + OS << NumStalls; + return; + } + + double Percentage = ((double)NumStalls / NumCycles) * 100.0; + OS << NumStalls << " (" + << format("%.1f", floor((Percentage * 10) + 0.5) / 10) << "%)"; +} + void DispatchStatistics::printDispatchStalls(raw_ostream &OS) const { std::string Buffer; - raw_string_ostream TempStream(Buffer); - TempStream << "\n\nDynamic Dispatch Stall Cycles:\n"; - TempStream << "RAT - Register unavailable: " - << 
HWStalls[HWStallEvent::RegisterFileStall]; - TempStream << "\nRCU - Retire tokens unavailable: " - << HWStalls[HWStallEvent::RetireControlUnitStall]; - TempStream << "\nSCHEDQ - Scheduler full: " - << HWStalls[HWStallEvent::SchedulerQueueFull]; - TempStream << "\nLQ - Load queue full: " - << HWStalls[HWStallEvent::LoadQueueFull]; - TempStream << "\nSQ - Store queue full: " - << HWStalls[HWStallEvent::StoreQueueFull]; - TempStream << "\nGROUP - Static restrictions on the dispatch group: " - << HWStalls[HWStallEvent::DispatchGroupStall]; - TempStream << '\n'; - TempStream.flush(); + raw_string_ostream SS(Buffer); + SS << "\n\nDynamic Dispatch Stall Cycles:\n"; + SS << "RAT - Register unavailable: "; + printStalls(SS, HWStalls[HWStallEvent::RegisterFileStall], NumCycles); + SS << "\nRCU - Retire tokens unavailable: "; + printStalls(SS, HWStalls[HWStallEvent::RetireControlUnitStall], NumCycles); + SS << "\nSCHEDQ - Scheduler full: "; + printStalls(SS, HWStalls[HWStallEvent::SchedulerQueueFull], NumCycles); + SS << "\nLQ - Load queue full: "; + printStalls(SS, HWStalls[HWStallEvent::LoadQueueFull], NumCycles); + SS << "\nSQ - Store queue full: "; + printStalls(SS, HWStalls[HWStallEvent::StoreQueueFull], NumCycles); + SS << "\nGROUP - Static restrictions on the dispatch group: "; + printStalls(SS, HWStalls[HWStallEvent::DispatchGroupStall], NumCycles); + SS << '\n'; + SS.flush(); OS << Buffer; } Index: tools/llvm-mca/Views/SummaryView.cpp =================================================================== --- tools/llvm-mca/Views/SummaryView.cpp +++ tools/llvm-mca/Views/SummaryView.cpp @@ -33,12 +33,10 @@ } void SummaryView::onEvent(const HWInstructionEvent &Event) { - // We are only interested in the "instruction dispatched" events generated by - // the dispatch stage for instructions that are part of iteration #0. 
- if (Event.Type != HWInstructionEvent::Dispatched) - return; - - if (Event.IR.getSourceIndex() >= Source.size()) + // We are only interested in the "instruction retired" events generated by + // the retire stage for instructions that are part of iteration #0. + if (Event.Type != HWInstructionEvent::Retired || + Event.IR.getSourceIndex() >= Source.size()) return; // Update the cumulative number of resource cycles based on the processor Index: tools/llvm-mca/Views/TimelineView.h =================================================================== --- tools/llvm-mca/Views/TimelineView.h +++ tools/llvm-mca/Views/TimelineView.h @@ -126,7 +126,7 @@ unsigned LastCycle; struct TimelineViewEntry { - unsigned CycleDispatched; + int CycleDispatched; // A negative value is an "invalid cycle". unsigned CycleReady; unsigned CycleIssued; unsigned CycleExecuted; Index: tools/llvm-mca/Views/TimelineView.cpp =================================================================== --- tools/llvm-mca/Views/TimelineView.cpp +++ tools/llvm-mca/Views/TimelineView.cpp @@ -29,6 +29,8 @@ MaxIterations = DEFAULT_ITERATIONS; NumInstructions *= std::min(MaxIterations, AsmSequence.getNumIterations()); Timeline.resize(NumInstructions); + TimelineViewEntry InvalidTVEntry = {-1, 0, 0, 0}; + std::fill(Timeline.begin(), Timeline.end(), InvalidTVEntry); WaitTimeEntry NullWTEntry = {0, 0, 0}; std::fill(WaitTime.begin(), WaitTime.end(), NullWTEntry); @@ -68,10 +70,13 @@ TVEntry.CycleRetired = CurrentCycle; // Update the WaitTime entry which corresponds to this Index. 
+ assert(TVEntry.CycleDispatched >= 0 && "Invalid TVEntry found!"); + unsigned CycleDispatched = static_cast(TVEntry.CycleDispatched); WaitTimeEntry &WTEntry = WaitTime[Index % AsmSequence.size()]; WTEntry.CyclesSpentInSchedulerQueue += - TVEntry.CycleIssued - TVEntry.CycleDispatched; - assert(TVEntry.CycleDispatched <= TVEntry.CycleReady); + TVEntry.CycleIssued - CycleDispatched; + assert(CycleDispatched <= TVEntry.CycleReady && + "Instruction cannot be ready if it hasn't been dispatched yet!"); WTEntry.CyclesSpentInSQWhileReady += TVEntry.CycleIssued - TVEntry.CycleReady; WTEntry.CyclesSpentAfterWBAndBeforeRetire += @@ -88,7 +93,11 @@ Timeline[Index].CycleExecuted = CurrentCycle; break; case HWInstructionEvent::Dispatched: - Timeline[Index].CycleDispatched = CurrentCycle; + // There may be multiple dispatch events. Microcoded instructions that are + // expanded into multiple uOps may require multiple dispatch cycles. Here, + // we want to capture the first dispatch cycle. + if (Timeline[Index].CycleDispatched == -1) + Timeline[Index].CycleDispatched = static_cast(CurrentCycle); break; default: return; @@ -193,19 +202,20 @@ OS << '\n'; OS << '[' << Iteration << ',' << SourceIndex << ']'; OS.PadToColumn(10); - for (unsigned I = 0, E = Entry.CycleDispatched; I < E; ++I) + assert(Entry.CycleDispatched >= 0 && "Invalid TimelineViewEntry!"); + unsigned CycleDispatched = static_cast(Entry.CycleDispatched); + for (unsigned I = 0, E = CycleDispatched; I < E; ++I) OS << ((I % 5 == 0) ? '.' : ' '); OS << TimelineView::DisplayChar::Dispatched; - if (Entry.CycleDispatched != Entry.CycleExecuted) { + if (CycleDispatched != Entry.CycleExecuted) { // Zero latency instructions have the same value for CycleDispatched, // CycleIssued and CycleExecuted. 
- for (unsigned I = Entry.CycleDispatched + 1, E = Entry.CycleIssued; I < E; - ++I) + for (unsigned I = CycleDispatched + 1, E = Entry.CycleIssued; I < E; ++I) OS << TimelineView::DisplayChar::Waiting; if (Entry.CycleIssued == Entry.CycleExecuted) OS << TimelineView::DisplayChar::DisplayChar::Executed; else { - if (Entry.CycleDispatched != Entry.CycleIssued) + if (CycleDispatched != Entry.CycleIssued) OS << TimelineView::DisplayChar::Executing; for (unsigned I = Entry.CycleIssued + 1, E = Entry.CycleExecuted; I < E; ++I) Index: tools/llvm-mca/include/HWEventListener.h =================================================================== --- tools/llvm-mca/include/HWEventListener.h +++ tools/llvm-mca/include/HWEventListener.h @@ -70,12 +70,23 @@ class HWInstructionDispatchedEvent : public HWInstructionEvent { public: - HWInstructionDispatchedEvent(const InstRef &IR, llvm::ArrayRef Regs) + HWInstructionDispatchedEvent(const InstRef &IR, llvm::ArrayRef Regs, + unsigned UOps) : HWInstructionEvent(HWInstructionEvent::Dispatched, IR), - UsedPhysRegs(Regs) {} + UsedPhysRegs(Regs), MicroOpcodes(UOps) {} // Number of physical register allocated for this instruction. There is one // entry per register file. llvm::ArrayRef UsedPhysRegs; + // Number of micro opcodes dispatched. + // This field is often set to the total number of micro-opcodes specified by + // the instruction descriptor of IR. + // The only exception is when IR declares a number of micro opcodes + // which exceeds the processor DispatchWidth, and - by construction - it + // requires multiple cycles to be fully dispatched. In that particular case, + // the dispatch logic would generate more than one dispatch event (one per + // cycle), and each event would declare how many micro opcodes have effectively + // been dispatched to the schedulers. 
+ unsigned MicroOpcodes; }; class HWInstructionRetiredEvent : public HWInstructionEvent { Index: tools/llvm-mca/include/Stages/DispatchStage.h =================================================================== --- tools/llvm-mca/include/Stages/DispatchStage.h +++ tools/llvm-mca/include/Stages/DispatchStage.h @@ -51,6 +51,7 @@ unsigned DispatchWidth; unsigned AvailableEntries; unsigned CarryOver; + InstRef CarriedOver; const llvm::MCSubtargetInfo &STI; RetireControlUnit &RCU; RegisterFile &PRF; @@ -63,7 +64,8 @@ void updateRAWDependencies(ReadState &RS, const llvm::MCSubtargetInfo &STI); void notifyInstructionDispatched(const InstRef &IR, - llvm::ArrayRef UsedPhysRegs); + llvm::ArrayRef UsedPhysRegs, + unsigned uOps); void collectWrites(llvm::SmallVectorImpl &Vec, unsigned RegID) const { @@ -75,7 +77,7 @@ const llvm::MCRegisterInfo &MRI, unsigned MaxDispatchWidth, RetireControlUnit &R, RegisterFile &F) : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth), - CarryOver(0U), STI(Subtarget), RCU(R), PRF(F) {} + CarryOver(0U), CarriedOver(), STI(Subtarget), RCU(R), PRF(F) {} bool isAvailable(const InstRef &IR) const override; Index: tools/llvm-mca/lib/Stages/DispatchStage.cpp =================================================================== --- tools/llvm-mca/lib/Stages/DispatchStage.cpp +++ tools/llvm-mca/lib/Stages/DispatchStage.cpp @@ -28,9 +28,11 @@ namespace mca { void DispatchStage::notifyInstructionDispatched(const InstRef &IR, - ArrayRef UsedRegs) { + ArrayRef UsedRegs, + unsigned UOps) { LLVM_DEBUG(dbgs() << "[E] Instruction Dispatched: #" << IR << '\n'); - notifyEvent(HWInstructionDispatchedEvent(IR, UsedRegs)); + notifyEvent( + HWInstructionDispatchedEvent(IR, UsedRegs, UOps)); } bool DispatchStage::checkPRF(const InstRef &IR) const { @@ -92,6 +94,7 @@ assert(AvailableEntries == DispatchWidth); AvailableEntries = 0; CarryOver = NumMicroOps - DispatchWidth; + CarriedOver = IR; } else { assert(AvailableEntries >= NumMicroOps); 
AvailableEntries -= NumMicroOps; @@ -125,13 +128,26 @@ // Notify listeners of the "instruction dispatched" event, // and move IR to the next stage. - notifyInstructionDispatched(IR, RegisterFiles); + notifyInstructionDispatched(IR, RegisterFiles, + std::min(DispatchWidth, NumMicroOps)); return moveToTheNextStage(IR); } llvm::Error DispatchStage::cycleStart() { + if (!CarryOver) { + AvailableEntries = DispatchWidth; + return llvm::ErrorSuccess(); + } + AvailableEntries = CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver; - CarryOver = CarryOver >= DispatchWidth ? CarryOver - DispatchWidth : 0U; + unsigned DispatchedOpcodes = DispatchWidth - AvailableEntries; + CarryOver -= DispatchedOpcodes; + assert(CarriedOver.isValid() && "Invalid dispatched instruction"); + + SmallVector RegisterFiles(PRF.getNumRegisterFiles(), 0U); + notifyInstructionDispatched(CarriedOver, RegisterFiles, DispatchedOpcodes); + if (!CarryOver) + CarriedOver = InstRef(); return llvm::ErrorSuccess(); }