Index: include/llvm/MC/MCSchedule.h
===================================================================
--- include/llvm/MC/MCSchedule.h
+++ include/llvm/MC/MCSchedule.h
@@ -183,6 +183,8 @@
   unsigned NumRegisterFiles;
   const MCRegisterCostEntry *RegisterCostTable;
   unsigned NumRegisterCostEntries;
+  unsigned LoadQueueID;
+  unsigned StoreQueueID;
 };

 /// Machine model for scheduling, bundling, and heuristics.
Index: include/llvm/Target/TargetSchedule.td
===================================================================
--- include/llvm/Target/TargetSchedule.td
+++ include/llvm/Target/TargetSchedule.td
@@ -561,3 +561,13 @@
   int MaxRetirePerCycle = retirePerCycle;
   SchedMachineModel SchedModel = ?;
 }
+
+// Base class for Load/StoreQueue. It is used to identify processor resources
+// which describe load/store queues in the LS unit.
+class MemoryQueue<ProcResourceKind PR> {
+  ProcResourceKind QueueDescriptor = PR;
+  SchedMachineModel SchedModel = ?;
+}
+
+class LoadQueue<ProcResourceKind LDQueue> : MemoryQueue<LDQueue>;
+class StoreQueue<ProcResourceKind STQueue> : MemoryQueue<STQueue>;
Index: lib/Target/X86/X86ScheduleBdVer2.td
===================================================================
--- lib/Target/X86/X86ScheduleBdVer2.td
+++ lib/Target/X86/X86ScheduleBdVer2.td
@@ -136,12 +136,16 @@
   let BufferSize = 40;
 }

+def PdLoadQueue : LoadQueue<PdLoad>;
+
 let Super = PdAGLU01 in
 def PdStore : ProcResource<1> {
   // For Piledriver, the store queue is 24 entries deep.
   let BufferSize = 24;
 }

+def PdStoreQueue : StoreQueue<PdStore>;
+
 //===----------------------------------------------------------------------===//
 // Integer Execution Units
 //
Index: test/tools/llvm-mca/X86/BdVer2/load-throughput.s
===================================================================
--- test/tools/llvm-mca/X86/BdVer2/load-throughput.s
+++ test/tools/llvm-mca/X86/BdVer2/load-throughput.s
@@ -79,16 +79,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
 # CHECK-NEXT: SQ - Store queue full: 0
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -102,9 +102,9 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
 # CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
 # CHECK-NEXT: PdStore 0 0 24

 # CHECK: Resources:
@@ -192,16 +192,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
 # CHECK-NEXT: SQ - Store queue full: 0
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -215,9 +215,9 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
 # CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
 # CHECK-NEXT: PdStore 0 0 24

 # CHECK: Resources:
@@ -305,16 +305,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
 # CHECK-NEXT: SQ - Store queue full: 0
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -328,9 +328,9 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
 # CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
 # CHECK-NEXT: PdStore 0 0 24

 # CHECK: Resources:
@@ -418,16 +418,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
 # CHECK-NEXT: SQ - Store queue full: 0
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -441,9 +441,9 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
 # CHECK-NEXT: PdFPU 0 0 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdLoad 36 40 40
 # CHECK-NEXT: PdStore 0 0 24

 # CHECK: Resources:
@@ -531,16 +531,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
 # CHECK-NEXT: SQ - Store queue full: 0
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -554,9 +554,9 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
-# CHECK-NEXT: PdFPU 35 40 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
+# CHECK-NEXT: PdFPU 27 30 64
+# CHECK-NEXT: PdLoad 36 40 40
 # CHECK-NEXT: PdStore 0 0 24

 # CHECK: Resources:
@@ -644,16 +644,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 161 (77.8%)
-# CHECK-NEXT: LQ - Load queue full: 0
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
+# CHECK-NEXT: LQ - Load queue full: 171 (82.6%)
 # CHECK-NEXT: SQ - Store queue full: 0
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (12.6%)
-# CHECK-NEXT: 2, 162 (78.3%)
-# CHECK-NEXT: 4, 19 (9.2%)
+# CHECK-NEXT: 0, 21 (10.1%)
+# CHECK-NEXT: 2, 172 (83.1%)
+# CHECK-NEXT: 4, 14 (6.8%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -667,9 +667,9 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 35 40 40
-# CHECK-NEXT: PdFPU 35 40 64
-# CHECK-NEXT: PdLoad 35 40 40
+# CHECK-NEXT: PdEX 27 30 40
+# CHECK-NEXT: PdFPU 27 30 64
+# CHECK-NEXT: PdLoad 36 40 40
 # CHECK-NEXT: PdStore 0 0 24

 # CHECK: Resources:
@@ -781,7 +781,7 @@
 # CHECK: [1] [2] [3] [4]
 # CHECK-NEXT: PdEX 1 2 40
 # CHECK-NEXT: PdFPU 1 2 64
-# CHECK-NEXT: PdLoad 1 2 40
+# CHECK-NEXT: PdLoad 11 12 40
 # CHECK-NEXT: PdStore 0 0 24

 # CHECK: Resources:
Index: test/tools/llvm-mca/X86/BdVer2/store-throughput.s
===================================================================
--- test/tools/llvm-mca/X86/BdVer2/store-throughput.s
+++ test/tools/llvm-mca/X86/BdVer2/store-throughput.s
@@ -79,16 +79,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
 # CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
 # CHECK-NEXT: 4, 7 (1.7%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -103,10 +103,10 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
 # CHECK-NEXT: PdFPU 0 0 64
 # CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24

 # CHECK: Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -193,16 +193,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
 # CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
 # CHECK-NEXT: 4, 7 (1.7%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -217,10 +217,10 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
 # CHECK-NEXT: PdFPU 0 0 64
 # CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24

 # CHECK: Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -307,16 +307,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
 # CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
 # CHECK-NEXT: 4, 7 (1.7%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -331,10 +331,10 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
 # CHECK-NEXT: PdFPU 0 0 64
 # CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24

 # CHECK: Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -421,16 +421,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
 # CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
 # CHECK-NEXT: 4, 7 (1.7%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -445,10 +445,10 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
+# CHECK-NEXT: PdEX 22 23 40
 # CHECK-NEXT: PdFPU 0 0 64
 # CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24

 # CHECK: Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -535,16 +535,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 745 (92.8%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
 # CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 747 (93.0%)
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 423 (52.7%)
-# CHECK-NEXT: 1, 373 (46.5%)
-# CHECK-NEXT: 3, 1 (0.1%)
+# CHECK-NEXT: 0, 422 (52.6%)
+# CHECK-NEXT: 1, 374 (46.6%)
+# CHECK-NEXT: 2, 1 (0.1%)
 # CHECK-NEXT: 4, 6 (0.7%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -559,8 +559,8 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 23 24 40
-# CHECK-NEXT: PdFPU 23 24 64
+# CHECK-NEXT: PdEX 22 23 40
+# CHECK-NEXT: PdFPU 22 23 64
 # CHECK-NEXT: PdLoad 0 0 40
 # CHECK-NEXT: PdStore 23 24 24
@@ -650,16 +650,16 @@
 # CHECK: Dynamic Dispatch Stall Cycles:
 # CHECK-NEXT: RAT - Register unavailable: 0
 # CHECK-NEXT: RCU - Retire tokens unavailable: 0
-# CHECK-NEXT: SCHEDQ - Scheduler full: 369 (91.6%)
+# CHECK-NEXT: SCHEDQ - Scheduler full: 0
 # CHECK-NEXT: LQ - Load queue full: 0
-# CHECK-NEXT: SQ - Store queue full: 0
+# CHECK-NEXT: SQ - Store queue full: 370 (91.8%)
 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0

 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
 # CHECK-NEXT: [# dispatched], [# cycles]
-# CHECK-NEXT: 0, 26 (6.5%)
-# CHECK-NEXT: 1, 369 (91.6%)
-# CHECK-NEXT: 3, 1 (0.2%)
+# CHECK-NEXT: 0, 25 (6.2%)
+# CHECK-NEXT: 1, 370 (91.8%)
+# CHECK-NEXT: 2, 1 (0.2%)
 # CHECK-NEXT: 4, 7 (1.7%)

 # CHECK: Schedulers - number of cycles where we saw N instructions issued:
 # CHECK-NEXT: [# issued], [# cycles]
@@ -674,10 +674,10 @@
 # CHECK-NEXT: [4] Total number of buffer entries.

 # CHECK: [1] [2] [3] [4]
-# CHECK-NEXT: PdEX 22 24 40
-# CHECK-NEXT: PdFPU 22 24 64
+# CHECK-NEXT: PdEX 22 23 40
+# CHECK-NEXT: PdFPU 22 23 64
 # CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 22 24 24
+# CHECK-NEXT: PdStore 23 24 24

 # CHECK: Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
@@ -789,7 +789,7 @@
 # CHECK-NEXT: PdEX 1 1 40
 # CHECK-NEXT: PdFPU 1 1 64
 # CHECK-NEXT: PdLoad 0 0 40
-# CHECK-NEXT: PdStore 1 1 24
+# CHECK-NEXT: PdStore 2 2 24

 # CHECK: Resources:
 # CHECK-NEXT: [0.0] - PdAGLU01
Index: tools/llvm-mca/Views/SchedulerStatistics.h
===================================================================
--- tools/llvm-mca/Views/SchedulerStatistics.h
+++ tools/llvm-mca/Views/SchedulerStatistics.h
@@ -47,9 +47,15 @@
 class SchedulerStatistics final : public View {
   const llvm::MCSchedModel &SM;
+  unsigned LQResourceID;
+  unsigned SQResourceID;
+
   unsigned NumIssued;
   unsigned NumCycles;

+  unsigned MostRecentLoadDispatched;
+  unsigned MostRecentStoreDispatched;
+
   // Tracks the usage of a scheduler's queue.
   struct BufferUsage {
     unsigned SlotsInUse;
@@ -65,11 +71,7 @@
   void printSchedulerUsage(llvm::raw_ostream &OS) const;

 public:
-  SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
-      : SM(STI.getSchedModel()), NumIssued(0), NumCycles(0),
-        IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
-        Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {}
-
+  SchedulerStatistics(const llvm::MCSubtargetInfo &STI);
   void onEvent(const HWInstructionEvent &Event) override;
   void onCycleBegin() override { NumCycles++; }
   void onCycleEnd() override { updateHistograms(); }
Index: tools/llvm-mca/Views/SchedulerStatistics.cpp
===================================================================
--- tools/llvm-mca/Views/SchedulerStatistics.cpp
+++ tools/llvm-mca/Views/SchedulerStatistics.cpp
@@ -19,29 +19,83 @@
 namespace llvm {
 namespace mca {

+SchedulerStatistics::SchedulerStatistics(const llvm::MCSubtargetInfo &STI)
+    : SM(STI.getSchedModel()), LQResourceID(0), SQResourceID(0), NumIssued(0),
+      NumCycles(0), MostRecentLoadDispatched(~0U),
+      MostRecentStoreDispatched(~0U),
+      IssuedPerCycle(STI.getSchedModel().NumProcResourceKinds, 0),
+      Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {
+  if (SM.hasExtraProcessorInfo()) {
+    const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+    LQResourceID = EPI.LoadQueueID;
+    SQResourceID = EPI.StoreQueueID;
+  }
+}
+
+// FIXME: This implementation works under the assumption that load/store queue
+// entries are reserved at 'instruction dispatched' stage, and released at
+// 'instruction executed' stage. This currently matches the behavior of LSUnit.
+//
+// The current design minimizes the number of events generated by the
+// Dispatch/Execute stages, at the cost of doing extra bookkeeping in method
+// `onEvent`. However, it introduces a subtle dependency between this view and
+// how the LSUnit works.
+//
+// In future we should add a new "memory queue" event type, so that we stop
+// making assumptions on how LSUnit internally works.
 void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
   if (Event.Type == HWInstructionEvent::Issued)
     ++NumIssued;
+  else if (Event.Type == HWInstructionEvent::Dispatched) {
+    const Instruction &Inst = *Event.IR.getInstruction();
+    const unsigned Index = Event.IR.getSourceIndex();
+    if (LQResourceID && Inst.getDesc().MayLoad &&
+        MostRecentLoadDispatched != Index) {
+      Usage[LQResourceID].SlotsInUse++;
+      MostRecentLoadDispatched = Index;
+    }
+    if (SQResourceID && Inst.getDesc().MayStore &&
+        MostRecentStoreDispatched != Index) {
+      Usage[SQResourceID].SlotsInUse++;
+      MostRecentStoreDispatched = Index;
+    }
+  } else if (Event.Type == HWInstructionEvent::Executed) {
+    const Instruction &Inst = *Event.IR.getInstruction();
+    if (LQResourceID && Inst.getDesc().MayLoad) {
+      assert(Usage[LQResourceID].SlotsInUse);
+      Usage[LQResourceID].SlotsInUse--;
+    }
+    if (SQResourceID && Inst.getDesc().MayStore) {
+      assert(Usage[SQResourceID].SlotsInUse);
+      Usage[SQResourceID].SlotsInUse--;
+    }
+  }
 }

 void SchedulerStatistics::onReservedBuffers(const InstRef & /* unused */,
                                             ArrayRef<unsigned> Buffers) {
   for (const unsigned Buffer : Buffers) {
-    BufferUsage &BU = Usage[Buffer];
-    BU.SlotsInUse++;
-    BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+    if (Buffer == LQResourceID || Buffer == SQResourceID)
+      continue;
+    Usage[Buffer].SlotsInUse++;
   }
 }

 void SchedulerStatistics::onReleasedBuffers(const InstRef & /* unused */,
                                             ArrayRef<unsigned> Buffers) {
-  for (const unsigned Buffer : Buffers)
+  for (const unsigned Buffer : Buffers) {
+    if (Buffer == LQResourceID || Buffer == SQResourceID)
+      continue;
     Usage[Buffer].SlotsInUse--;
+  }
 }

 void SchedulerStatistics::updateHistograms() {
-  for (BufferUsage &BU : Usage)
+  for (BufferUsage &BU : Usage) {
     BU.CumulativeNumUsedSlots += BU.SlotsInUse;
+    BU.MaxUsedSlots = std::max(BU.MaxUsedSlots, BU.SlotsInUse);
+  }
+
   IssuedPerCycle[NumIssued]++;
   NumIssued = 0;
 }
Index: tools/llvm-mca/include/HardwareUnits/LSUnit.h
===================================================================
--- tools/llvm-mca/include/HardwareUnits/LSUnit.h
+++ tools/llvm-mca/include/HardwareUnits/LSUnit.h
@@ -18,6 +18,7 @@

 #include "HardwareUnits/HardwareUnit.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/MC/MCSchedule.h"

 namespace llvm {
 namespace mca {
@@ -99,6 +100,43 @@
   // If true, loads will never alias with stores. This is the default.
   bool NoAlias;

+  // When a `MayLoad` instruction is dispatched to the schedulers for
+  // execution, the LSUnit reserves an entry in the `LoadQueue` for it.
+  //
+  // LoadQueue keeps track of all the loads that are in-flight. A load
+  // instruction is eventually removed from the LoadQueue when it reaches
+  // completion stage. That means, the load must be 'executed', and the LS
+  // unit must be able to forward the loaded value on the data path.
+  //
+  // This class doesn't know about the latency of a load instruction. So, it
+  // conservatively/pessimistically assumes that the latency of a load opcode
+  // matches the instruction latency.
+  //
+  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB misses),
+  // or load/store conflicts, the latency of a load is determined by the depth
+  // of the load pipeline. So, we could use field `LoadLatency` in the
+  // MCSchedModel to model that latency.
+  // However, field `LoadLatency` is often based on the 'load-to-use' latency
+  // from L1D (as reported in the official hardware documentation), and it
+  // normally already accounts for the extra latency due to the forwarding
+  // data paths.
+  // That being said, when doing throughput analysis, `LoadLatency` may be a
+  // better predictor of load latency than the instruction latency.
+  // In particular, it would improve the load queue simulation in the presence
+  // of long latency instructions with folded memory operands.
+  //
+  // FIXME: On some processors, load/store operations are split into multiple
+  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types,
+  // but not 256-bit data types. So, AVX operations on 256-bit data types are
+  // effectively split into two complex operations. That also means a 256-bit
+  // load is effectively split into two 128-bit loads, and each split load
+  // consumes one entry in the LoadQueue. For simplicity, this class assumes
+  // that any load instruction only consumes a single entry in the LoadQueue.
+  // Similarly, store instructions only consume a single entry in the
+  // StoreQueue. This is a very optimistic assumption.
+  // In future, we should assess again the quality of this design, and consider
+  // alternative approaches that let instructions specify the number of
+  // load/store queue entries consumed at dispatch stage.
   SmallSet<unsigned, 16> LoadQueue;
   SmallSet<unsigned, 16> StoreQueue;

@@ -122,8 +160,8 @@
   bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }

 public:
-  LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
-      : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
+  LSUnit(const MCSchedModel &SM, unsigned LQ = 0, unsigned SQ = 0,
+         bool AssumeNoAlias = false);

 #ifndef NDEBUG
   void dump() const;
@@ -149,6 +187,15 @@
   // 5. A load has to wait until an older load barrier is fully executed.
   // 6. A store has to wait until an older store barrier is fully executed.
   virtual bool isReady(const InstRef &IR) const;
+
+  // Load and store instructions are tracked by their corresponding queues from
+  // dispatch until the "instruction executed" event.
+  // Only when a load instruction reaches the 'Executed' stage does its value
+  // become available to the users. At that point, the load no longer needs to
+  // be tracked by the load queue.
+  // FIXME: For simplicity, we optimistically assume a similar behavior for
+  // store instructions. In practice, store operations don't tend to leave the
+  // store queue until they reach the 'Retired' stage.
   void onInstructionExecuted(const InstRef &IR);
 };
Index: tools/llvm-mca/lib/Context.cpp
===================================================================
--- tools/llvm-mca/lib/Context.cpp
+++ tools/llvm-mca/lib/Context.cpp
@@ -35,8 +35,8 @@
   // Create the hardware units defining the backend.
   auto RCU = llvm::make_unique<RetireControlUnit>(SM);
   auto PRF = llvm::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
-  auto LSU = llvm::make_unique<LSUnit>(Opts.LoadQueueSize, Opts.StoreQueueSize,
-                                       Opts.AssumeNoAlias);
+  auto LSU = llvm::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
+                                       Opts.StoreQueueSize, Opts.AssumeNoAlias);
   auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get());

   // Create the pipeline stages.
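Before the LSUnit.cpp changes below, it may help to see the lookup pattern introduced by this patch in isolation. The following sketch is not part of the patch: the helper name is invented, and it only illustrates how a client of MCSchedModel could translate the new LoadQueueID field into a queue size, with 0 meaning the queue stays unbounded (the same fallback LSUnit applies when -lqueue is left at its default).

#include "llvm/MC/MCSchedule.h"

// Minimal sketch (hypothetical helper): derive the simulated load queue size
// for a subtarget from the extra processor info added by this patch.
static unsigned getModeledLoadQueueSize(const llvm::MCSchedModel &SM) {
  if (!SM.hasExtraProcessorInfo())
    return 0; // No extra info: leave the load queue unbounded.
  const llvm::MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
  if (!EPI.LoadQueueID)
    return 0; // Zero is the sentinel for "no LoadQueue declared".
  // LoadQueueID names the processor resource that describes the queue; its
  // BufferSize is the number of in-flight loads the queue can track.
  const llvm::MCProcResourceDesc &Desc = *SM.getProcResource(EPI.LoadQueueID);
  return Desc.BufferSize < 0 ? 0 : static_cast<unsigned>(Desc.BufferSize);
}

The store queue is handled symmetrically through StoreQueueID, as the constructor in LSUnit.cpp below shows.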
Index: tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
===================================================================
--- tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
+++ tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
@@ -22,6 +22,23 @@
 namespace llvm {
 namespace mca {

+LSUnit::LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ,
+               bool AssumeNoAlias)
+    : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {
+  if (SM.hasExtraProcessorInfo()) {
+    const MCExtraProcessorInfo &MEP = SM.getExtraProcessorInfo();
+    if (!LQ_Size && MEP.LoadQueueID) {
+      const MCProcResourceDesc &LdQDesc = *SM.getProcResource(MEP.LoadQueueID);
+      LQ_Size = LdQDesc.BufferSize;
+    }
+
+    if (!SQ_Size && MEP.StoreQueueID) {
+      const MCProcResourceDesc &StQDesc = *SM.getProcResource(MEP.StoreQueueID);
+      SQ_Size = StQDesc.BufferSize;
+    }
+  }
+}
+
 #ifndef NDEBUG
 void LSUnit::dump() const {
   dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
Index: tools/llvm-mca/llvm-mca.cpp
===================================================================
--- tools/llvm-mca/llvm-mca.cpp
+++ tools/llvm-mca/llvm-mca.cpp
@@ -151,12 +151,12 @@
 static cl::opt<unsigned>
     LoadQueueSize("lqueue",
-                  cl::desc("Size of the load queue (unbound by default)"),
+                  cl::desc("Size of the load queue"),
                   cl::cat(ToolOptions), cl::init(0));

 static cl::opt<unsigned>
     StoreQueueSize("squeue",
-                   cl::desc("Size of the store queue (unbound by default)"),
+                   cl::desc("Size of the store queue"),
                    cl::cat(ToolOptions), cl::init(0));

 static cl::opt
Index: utils/TableGen/CodeGenSchedule.h
===================================================================
--- utils/TableGen/CodeGenSchedule.h
+++ utils/TableGen/CodeGenSchedule.h
@@ -246,10 +246,14 @@
   // Optional Retire Control Unit definition.
   Record *RetireControlUnit;

+  // Load/Store queue descriptors.
+  Record *LoadQueue;
+  Record *StoreQueue;
+
   CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef,
                    Record *IDef) :
     Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
-    RetireControlUnit(nullptr) {}
+    RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {}

   bool hasItineraries() const {
     return !ItinsDef->getValueAsListOfDefs("IID").empty();
@@ -260,7 +264,8 @@
   }

   bool hasExtraProcessorInfo() const {
-    return RetireControlUnit || !RegisterFiles.empty();
+    return RetireControlUnit || LoadQueue || StoreQueue ||
+           !RegisterFiles.empty();
   }

   unsigned getProcResourceIdx(Record *PRDef) const;
@@ -607,6 +612,8 @@

   void collectSTIPredicates();

+  void collectLoadStoreQueueInfo();
+
   void checkCompleteness();

   void inferFromRW(ArrayRef<unsigned> OperWrites, ArrayRef<unsigned> OperReads,
Index: utils/TableGen/CodeGenSchedule.cpp
===================================================================
--- utils/TableGen/CodeGenSchedule.cpp
+++ utils/TableGen/CodeGenSchedule.cpp
@@ -479,6 +479,35 @@
   }
 }

+void CodeGenSchedModels::collectLoadStoreQueueInfo() {
+  RecVec Queues = Records.getAllDerivedDefinitions("MemoryQueue");
+
+  for (Record *Queue : Queues) {
+    CodeGenProcModel &PM = getProcModel(Queue->getValueAsDef("SchedModel"));
+    if (Queue->isSubClassOf("LoadQueue")) {
+      if (PM.LoadQueue) {
+        PrintError(Queue->getLoc(),
+                   "Expected a single LoadQueue definition");
+        PrintNote(PM.LoadQueue->getLoc(),
+                  "Previous definition of LoadQueue was here");
+      }
+
+      PM.LoadQueue = Queue;
+    }
+
+    if (Queue->isSubClassOf("StoreQueue")) {
+      if (PM.StoreQueue) {
+        PrintError(Queue->getLoc(),
+                   "Expected a single StoreQueue definition");
+        PrintNote(PM.StoreQueue->getLoc(),
+                  "Previous definition of StoreQueue was here");
+      }
+
+      PM.StoreQueue = Queue;
+    }
+  }
+}
+
 /// Collect optional processor information.
 void CodeGenSchedModels::collectOptionalProcessorInfo() {
   // Find register file definitions for each processor.
@@ -487,6 +516,9 @@
   // Collect processor RetireControlUnit descriptors if available.
   collectRetireControlUnits();

+  // Collect information about load/store queues.
+  collectLoadStoreQueueInfo();
+
   checkCompleteness();
 }
Index: utils/TableGen/SubtargetEmitter.cpp
===================================================================
--- utils/TableGen/SubtargetEmitter.cpp
+++ utils/TableGen/SubtargetEmitter.cpp
@@ -93,6 +93,8 @@
                                 &ProcItinLists);
   unsigned EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
                                   raw_ostream &OS);
+  void EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
+                              raw_ostream &OS);
   void EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
                               raw_ostream &OS);
   void EmitProcessorProp(raw_ostream &OS, const Record *R, StringRef Name,
@@ -697,6 +699,30 @@
   return CostTblIndex;
 }

+void SubtargetEmitter::EmitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel,
+                                              raw_ostream &OS) {
+  unsigned QueueID = 0;
+  if (ProcModel.LoadQueue) {
+    const Record *Queue = ProcModel.LoadQueue->getValueAsDef("QueueDescriptor");
+    QueueID =
+        1 + std::distance(ProcModel.ProcResourceDefs.begin(),
+                          std::find(ProcModel.ProcResourceDefs.begin(),
+                                    ProcModel.ProcResourceDefs.end(), Queue));
+  }
+  OS << " " << QueueID << ", // Resource Descriptor for the Load Queue\n";
+
+  QueueID = 0;
+  if (ProcModel.StoreQueue) {
+    const Record *Queue =
+        ProcModel.StoreQueue->getValueAsDef("QueueDescriptor");
+    QueueID =
+        1 + std::distance(ProcModel.ProcResourceDefs.begin(),
+                          std::find(ProcModel.ProcResourceDefs.begin(),
+                                    ProcModel.ProcResourceDefs.end(), Queue));
+  }
+  OS << " " << QueueID << ", // Resource Descriptor for the Store Queue\n";
+}
+
 void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
                                               raw_ostream &OS) {
   // Generate a table of register file descriptors (one entry per each user
@@ -715,6 +741,9 @@
   EmitRegisterFileInfo(ProcModel, ProcModel.RegisterFiles.size(),
                        NumCostEntries, OS);

+  // Add information about load/store queues.
+  EmitLoadStoreQueueInfo(ProcModel, OS);
+
   OS << "};\n";
 }
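A note on the index scheme used by EmitLoadStoreQueueInfo above: processor resource IDs in the generated tables are 1-based, so the emitter writes 1 plus the position of the QueueDescriptor within the model's resource list, and 0 is reserved to mean "no queue declared" (the value that LSUnit and SchedulerStatistics test against). The toy program below is only an illustration of that convention; it uses made-up resource names rather than the real tables produced in X86GenSubtargetInfo.inc.

#include <algorithm>
#include <cassert>
#include <iterator>
#include <string>
#include <vector>

// Toy stand-in for the emitter's computation: return the 1-based index of a
// queue descriptor in the processor resource list, or 0 when it is absent.
static unsigned queueDescriptorID(const std::vector<std::string> &Resources,
                                  const std::string &QueueDescriptor) {
  auto It = std::find(Resources.begin(), Resources.end(), QueueDescriptor);
  if (It == Resources.end())
    return 0; // Sentinel: the model declares no such queue.
  return 1 + static_cast<unsigned>(std::distance(Resources.begin(), It));
}

int main() {
  // Hypothetical ordering; the real indices come from the TableGen output.
  std::vector<std::string> PdResources = {"PdAGLU01", "PdLoad", "PdStore"};
  assert(queueDescriptorID(PdResources, "PdLoad") == 2);
  assert(queueDescriptorID(PdResources, "PdStore") == 3);
  assert(queueDescriptorID(PdResources, "PdNotAQueue") == 0);
  return 0;
}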