Index: include/llvm/MC/MCSchedule.h =================================================================== --- include/llvm/MC/MCSchedule.h +++ include/llvm/MC/MCSchedule.h @@ -24,9 +24,7 @@ /// Define a kind of processor resource that will be modeled by the scheduler. struct MCProcResourceDesc { -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) const char *Name; -#endif unsigned NumUnits; // Number of resource of this kind unsigned SuperIdx; // Index of the resources kind that contains this kind. Index: test/tools/llvm-mca/ARM/lit.local.cfg =================================================================== --- test/tools/llvm-mca/ARM/lit.local.cfg +++ test/tools/llvm-mca/ARM/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'ARM' in config.root.targets: + config.unsupported = True Index: test/tools/llvm-mca/ARM/simple-test-cortex-a9.s =================================================================== --- test/tools/llvm-mca/ARM/simple-test-cortex-a9.s +++ test/tools/llvm-mca/ARM/simple-test-cortex-a9.s @@ -0,0 +1,37 @@ +# RUN: llvm-mca -mtriple=arm-eabi -mcpu=cortex-a9 -iterations=100 < %s | FileCheck %s + +vadd.f32 s0, s2, s2 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 100 +# CHECK-NEXT: Total Cycles: 105 +# CHECK-NEXT: Dispatch Width: 2 +# CHECK-NEXT: IPC: 0.95 + +# CHECK: Resources: +# CHECK-NEXT: [0] - A9UnitAGU +# CHECK-NEXT: [1.0] - A9UnitALU +# CHECK-NEXT: [1.1] - A9UnitALU +# CHECK-NEXT: [2] - A9UnitB +# CHECK-NEXT: [3] - A9UnitFP +# CHECK-NEXT: [4] - A9UnitLS +# CHECK-NEXT: [5] - A9UnitMul + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] +# CHECK-NEXT: 1.00 - - - 1.00 - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] Instructions: +# CHECK-NEXT: 1.00 - - - 1.00 - - vadd.f32 s0, s2, s2 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 1.00 vadd.f32 s0, s2, s2 Index: test/tools/llvm-mca/X86/cpus.s =================================================================== --- test/tools/llvm-mca/X86/cpus.s +++ test/tools/llvm-mca/X86/cpus.s @@ -0,0 +1,27 @@ +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s | FileCheck --check-prefix=ALL --check-prefix=BTVER2 %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 < %s | FileCheck --check-prefix=ALL --check-prefix=ZNVER1 %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge < %s | FileCheck --check-prefix=ALL --check-prefix=SANDYBRIDGE %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=ivybridge < %s | FileCheck --check-prefix=ALL --check-prefix=IVYBRIDGE %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=haswell < %s | FileCheck --check-prefix=ALL --check-prefix=HASWELL %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=broadwell < %s | FileCheck --check-prefix=ALL --check-prefix=BROADWELL %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=knl < %s | FileCheck --check-prefix=ALL --check-prefix=KNL %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=skylake < %s | FileCheck --check-prefix=ALL --check-prefix=SKX %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 < %s | FileCheck --check-prefix=ALL --check-prefix=SKX-AVX512 %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=slm < %s | FileCheck --check-prefix=ALL --check-prefix=SLM %s + +add %edi, %eax + +# ALL: Iterations: 70 +# ALL-NEXT: Instructions: 70 + +# BTVER2: Dispatch Width: 2 +# ZNVER1: Dispatch Width: 4 +# SANDYBRIDGE: Dispatch Width: 4 +# IVYBRIDGE: Dispatch Width: 4 +# HASWELL: Dispatch Width: 4 +# BROADWELL: Dispatch Width: 4 +# KNL: Dispatch Width: 4 +# SKX: Dispatch Width: 6 +# SKX-AVX512: Dispatch Width: 6 +# SLM: Dispatch Width: 2 + Index: test/tools/llvm-mca/X86/default-iterations.s =================================================================== --- test/tools/llvm-mca/X86/default-iterations.s +++ test/tools/llvm-mca/X86/default-iterations.s @@ -0,0 +1,11 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=0 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 < %s 2>&1 | FileCheck --check-prefix=CUSTOM %s + +add %eax, %eax + +# DEFAULT: Iterations: 70 +# DEFAULT-NEXT: Instructions: 70 + +# CUSTOM: Iterations: 1 +# CUSTOM-NEXT: Instructions: 1 Index: test/tools/llvm-mca/X86/dispatch_width.s =================================================================== --- test/tools/llvm-mca/X86/dispatch_width.s +++ test/tools/llvm-mca/X86/dispatch_width.s @@ -0,0 +1,8 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=0 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=1 < %s 2>&1 | FileCheck --check-prefix=CUSTOM %s + +add %eax, %eax + +# DEFAULT: Dispatch Width: 2 +# CUSTOM: Dispatch Width: 1 Index: test/tools/llvm-mca/X86/in-order-cpu.s =================================================================== --- test/tools/llvm-mca/X86/in-order-cpu.s +++ test/tools/llvm-mca/X86/in-order-cpu.s @@ -0,0 +1,3 @@ +# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o %t1 2>&1 | FileCheck %s + +# CHECK: error: please specify an out-of-order cpu. 'atom' is an in-order cpu. Index: test/tools/llvm-mca/X86/invalid-assembly-sequence.s =================================================================== --- test/tools/llvm-mca/X86/invalid-assembly-sequence.s +++ test/tools/llvm-mca/X86/invalid-assembly-sequence.s @@ -0,0 +1,3 @@ +# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s + +invalid_instruction_mnemonic Index: test/tools/llvm-mca/X86/invalid-cpu.s =================================================================== --- test/tools/llvm-mca/X86/invalid-cpu.s +++ test/tools/llvm-mca/X86/invalid-cpu.s @@ -0,0 +1,3 @@ +# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=foo -o %t1 2>&1 | FileCheck %s + +# CHECK: 'foo' is not a recognized processor for this target (ignoring processor) Index: test/tools/llvm-mca/X86/invalid-empty-file.s =================================================================== --- test/tools/llvm-mca/X86/invalid-empty-file.s +++ test/tools/llvm-mca/X86/invalid-empty-file.s @@ -0,0 +1,3 @@ +# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -o %t1 2>&1 | FileCheck %s + +# CHECK: error: no assembly instructions found. Index: test/tools/llvm-mca/X86/lit.local.cfg =================================================================== --- test/tools/llvm-mca/X86/lit.local.cfg +++ test/tools/llvm-mca/X86/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'X86' in config.root.targets: + config.unsupported = True + Index: test/tools/llvm-mca/X86/no-sched-model.s =================================================================== --- test/tools/llvm-mca/X86/no-sched-model.s +++ test/tools/llvm-mca/X86/no-sched-model.s @@ -0,0 +1,4 @@ +# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown < %s 2>&1 | FileCheck %s +# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=generic < %s 2>&1 | FileCheck %s + +# CHECK: error: unable to find instruction-level scheduling information for target triple 'x86_64-unknown-unknown' and cpu 'generic'. Index: test/tools/llvm-mca/X86/simple-test-btver2.s =================================================================== --- test/tools/llvm-mca/X86/simple-test-btver2.s +++ test/tools/llvm-mca/X86/simple-test-btver2.s @@ -0,0 +1,45 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 < %s | FileCheck %s + +add %edi, %eax + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 100 +# CHECK-NEXT: Total Cycles: 103 +# CHECK-NEXT: Dispatch Width: 2 +# CHECK-NEXT: IPC: 0.97 + +# CHECK-LABEL: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - addl %edi, %eax + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.50 addl %edi, %eax Index: test/tools/llvm-mca/invalid_input_file_name.test =================================================================== --- test/tools/llvm-mca/invalid_input_file_name.test +++ test/tools/llvm-mca/invalid_input_file_name.test @@ -0,0 +1,3 @@ +# RUN: not llvm-mca %t.blah -o %t2 2>&1 | FileCheck --check-prefix=ENOENT %s + +# ENOENT: {{.*}}.blah: {{[Nn]}}o such file or directory Index: test/tools/llvm-mca/lit.local.cfg =================================================================== --- test/tools/llvm-mca/lit.local.cfg +++ test/tools/llvm-mca/lit.local.cfg @@ -0,0 +1,4 @@ +# Requires a non-empty default triple for these tests +if 'default_triple' not in config.available_features: + config.unsupported = True + Index: tools/LLVMBuild.txt =================================================================== --- tools/LLVMBuild.txt +++ tools/LLVMBuild.txt @@ -37,6 +37,7 @@ llvm-link llvm-lto llvm-mc + llvm-mca llvm-mcmarkup llvm-modextract llvm-mt Index: tools/llvm-mca/Backend.h =================================================================== --- tools/llvm-mca/Backend.h +++ tools/llvm-mca/Backend.h @@ -0,0 +1,141 @@ +//===--------------------- Backend.h ----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements an OoO backend for the llvm-mca tool. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_BACKEND_H +#define LLVM_TOOLS_LLVM_MCA_BACKEND_H + +#include "Dispatch.h" +#include "InstrBuilder.h" +#include "Scheduler.h" +#include "SourceMgr.h" + +namespace mca { + +using namespace llvm; + +struct HWEventListener; + +/// \brief An out of order backend for a specific subtarget. +/// +/// It emulates an out-of-order execution of instructions. Instructions are +/// fetched from a MCInst sequence managed by an object of class SourceMgr. +/// Instructions are firstly dispatched to the schedulers and then executed. +/// This class tracks the lifetime of an instruction from the moment where +/// it gets dispatched to the schedulers, to the moment where it finishes +/// executing and register writes are architecturally committed. +/// In particular, it monitors changes in the state of every instruction +/// in flight. +/// Instructions are executed in a loop of iterations. The number of iterations +/// is defined by the SourceMgr object. +/// The Backend entrypoint is method 'Run()' which execute cycles in a loop +/// until there are new instructions to dispatch, and not every instruction +/// has been retired. +/// Internally, the Backend collects statistical information in the form of +/// histograms. For example, it tracks how the dispatch group size changes +/// over time. +class Backend { + const MCSubtargetInfo &STI; + + std::unique_ptr IB; + std::unique_ptr HWS; + std::unique_ptr DU; + std::unique_ptr SM; + unsigned Cycles; + + DenseMap> Instructions; + std::set Listeners; + + void RunCycle(unsigned Cycle); + +public: + Backend(const MCSubtargetInfo &Subtarget, const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, std::unique_ptr Source, + unsigned DispatchWidth = 0, unsigned RegisterFileSize = 0, + unsigned MaxRetirePerCycle = 0, unsigned LoadQueueSize = 0, + unsigned StoreQueueSize = 0, bool AssumeNoAlias = false) + : STI(Subtarget), + HWS(llvm::make_unique(this, Subtarget.getSchedModel(), + LoadQueueSize, StoreQueueSize, + AssumeNoAlias)), + DU(llvm::make_unique( + this, MRI, Subtarget.getSchedModel().MicroOpBufferSize, + RegisterFileSize, MaxRetirePerCycle, DispatchWidth, HWS.get())), + SM(std::move(Source)), Cycles(0) { + IB = llvm::make_unique(MCII, GetProcResourceMasks()); + } + + int Run() { + while (SM->HasNext() || !DU->IsRCUEmpty()) + RunCycle(Cycles++); + return 0; + } + + unsigned GetNumIterations() const { return SM->GetNumIterations(); } + unsigned GetNumInstructions() const { return SM->size(); } + unsigned GetNumCycles() const { return Cycles; } + unsigned GetTotalRegisterMappingsCreated() const { + return DU->GetTotalRegisterMappingsCreated(); + } + unsigned GetMaxUsedRegisterMappings() const { + return DU->GetMaxUsedRegisterMappings(); + } + unsigned GetDispatchWidth() const { return DU->GetDispatchWidth(); } + + const MCSubtargetInfo &GetSTI() const { return STI; } + const MCSchedModel &GetSchedModel() const { return STI.getSchedModel(); } + const ArrayRef GetProcResourceMasks() const { + return HWS->GetProcResourceMasks(); + } + + double GetRThroughput(const InstrDesc &ID) const { + return HWS->GetRThroughput(ID); + } + void GetBuffersUsage(std::vector &Usage) const { + return HWS->GetBuffersUsage(Usage); + } + + unsigned GetNumRATStalls() const { return DU->GetNumRATStalls(); } + unsigned GetNumRCUStalls() const { return DU->GetNumRCUStalls(); } + unsigned GetNumSQStalls() const { return DU->GetNumSQStalls(); } + unsigned GetNumLDQStalls() const { return DU->GetNumLDQStalls(); } + unsigned GetNumSTQStalls() const { return DU->GetNumSTQStalls(); } + unsigned GetNumDispatchGroupStalls() const { + return DU->GetNumDispatchGroupStalls(); + } + + const MCInst &GetMCInstFromIndex(unsigned Index) const { + return SM->GetMCInstFromIndex(Index); + } + + const InstrDesc &GetInstrDesc(const MCInst &Inst) const { + return IB->GetOrCreateInstrDesc(STI, Inst); + } + + const SourceMgr &GetSourceMgr() const { return *SM; } + + void AddEventListener(HWEventListener *Listener); + void NotifyCycleBegin(unsigned Cycle); + void NotifyInstructionDispatched(unsigned Index); + void NotifyInstructionReady(unsigned Index); + void NotifyInstructionIssued( + unsigned Index, const ArrayRef> &Used); + void NotifyInstructionExecuted(unsigned Index); + void NotifyResourceAvailable(const ResourceRef &RR); + void NotifyInstructionRetired(unsigned Index); + void NotifyCycleEnd(unsigned Cycle); +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/Backend.cpp =================================================================== --- tools/llvm-mca/Backend.cpp +++ tools/llvm-mca/Backend.cpp @@ -0,0 +1,130 @@ +//===--------------------- Backend.cpp --------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Implementation of class Backend which emulates an hardware OoO backend. +/// +//===----------------------------------------------------------------------===// + +#include "Backend.h" +#include "HWEventListener.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/Debug.h" + +namespace mca { + +#define DEBUG_TYPE "llvm-mca" + +void Backend::AddEventListener(HWEventListener *Listener) { + if (Listener) + Listeners.insert(Listener); +} + +void Backend::RunCycle(unsigned Cycle) { + NotifyCycleBegin(Cycle); + + if (!SM->HasNext()) { + NotifyCycleEnd(Cycle); + return; + } + + InstRef IR = SM->PeekNext(); + const InstrDesc *Desc = + &IB->GetOrCreateInstrDesc(STI, *IR.second); + while (DU->IsAvailable(Desc->NumMicroOps) && DU->CanDispatch(*Desc)) { + Instruction *NewIS = IB->CreateInstruction(STI, *DU, IR.first, *IR.second); + Instructions[IR.first] = std::unique_ptr(NewIS); + NewIS->SetRCUTokenID(DU->Dispatch(IR.first, NewIS)); + + // If this is a zero latency instruction, then we don't need to dispatch + // it. Instead, we can mark it as executed. + if (NewIS->IsZeroLatency()) + NotifyInstructionExecuted(IR.first); + + // Check if we have dispatched all the instructions. + SM->UpdateNext(); + if (!SM->HasNext()) + break; + + // Prepare for the next round. + IR = SM->PeekNext(); + Desc = &IB->GetOrCreateInstrDesc(STI, *IR.second); + } + + NotifyCycleEnd(Cycle); +} + +void Backend::NotifyCycleBegin(unsigned Cycle) { + DEBUG(dbgs() << "[E] Cycle begin: " << Cycle << '\n'); + for (HWEventListener *Listener : Listeners) + Listener->OnCycleBegin(Cycle); + + DU->CycleEvent(Cycle); + HWS->CycleEvent(Cycle); +} + +void Backend::NotifyInstructionDispatched(unsigned Index) { + DEBUG(dbgs() << "[E] Instruction Dispatched: " << Index << '\n'); + for (HWEventListener *Listener : Listeners) + Listener->OnInstructionDispatched(Index); +} + +void Backend::NotifyInstructionReady(unsigned Index) { + DEBUG(dbgs() << "[E] Instruction Ready: " << Index << '\n'); + for (HWEventListener *Listener : Listeners) + Listener->OnInstructionReady(Index); +} + +void Backend::NotifyInstructionIssued( + unsigned Index, const ArrayRef> &Used) { + DEBUG(dbgs() << "[E] Instruction Issued: " << Index << '\n'; + for (const auto &Resource + : Used) { + dbgs() << "[E] Resource Used: [" << Resource.first.first << '.' + << Resource.first.second << "]\n"; + dbgs() << " cycles: " << Resource.second << '\n'; + }); + + for (HWEventListener *Listener : Listeners) + Listener->OnInstructionIssued(Index, Used); +} + +void Backend::NotifyInstructionExecuted(unsigned Index) { + DEBUG(dbgs() << "[E] Instruction Executed: " << Index << '\n'); + for (HWEventListener *Listener : Listeners) + Listener->OnInstructionExecuted(Index); + + const Instruction &IS = *Instructions[Index]; + DU->OnInstructionExecuted(IS.GetRCUTokenID()); +} + +void Backend::NotifyInstructionRetired(unsigned Index) { + DEBUG(dbgs() << "[E] Instruction Retired: " << Index << '\n'); + for (HWEventListener *Listener : Listeners) + Listener->OnInstructionRetired(Index); + + const Instruction &IS = *Instructions[Index]; + DU->InvalidateRegisterMappings(IS); + Instructions.erase(Index); +} + +void Backend::NotifyResourceAvailable(const ResourceRef &RR) { + DEBUG(dbgs() << "[E] Resource Available: [" << RR.first << '.' << RR.second + << "]\n"); + for (HWEventListener *Listener : Listeners) + Listener->OnResourceAvailable(RR); +} + +void Backend::NotifyCycleEnd(unsigned Cycle) { + DEBUG(dbgs() << "[E] Cycle end: " << Cycle << "\n\n"); + for (HWEventListener *Listener : Listeners) + Listener->OnCycleEnd(Cycle); +} + +} // namespace mca. Index: tools/llvm-mca/BackendPrinter.h =================================================================== --- tools/llvm-mca/BackendPrinter.h +++ tools/llvm-mca/BackendPrinter.h @@ -0,0 +1,104 @@ +//===--------------------- BackendPrinter.h ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements class BackendPrinter. +/// BackendPrinter is able to collect statistics related to the code executed +/// by the Backend class. Information is then printed out with the help of +/// a MCInstPrinter (to pretty print MCInst objects) and other helper classes. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_BACKENDPRINTER_H +#define LLVM_TOOLS_LLVM_MCA_BACKENDPRINTER_H + +#include "Backend.h" +#include "BackendStatistics.h" +#include "ResourcePressureView.h" +#include "TimelineView.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/ToolOutputFile.h" + +using namespace llvm; + +#define DEBUG_TYPE "llvm-mca" + +namespace mca { + +class ResourcePressureView; +class TimelineView; + +/// \brief A printer class that knows how to collects statistics on the +/// code analyzed by the llvm-mca tool. +/// +/// This class knows how to print out the analysis information collected +/// during the execution of the code. Internally, it delegates to other +/// classes the task of printing out timeline information as well as +/// resource pressure. +class BackendPrinter { + Backend &B; + bool EnableVerboseOutput; + + std::unique_ptr MCIP; + std::unique_ptr File; + + std::unique_ptr RPV; + std::unique_ptr TV; + std::unique_ptr BS; + + using Histogram = std::map; + void printDUStatistics(const Histogram &Stats, unsigned Cycles) const; + void printDispatchStalls(unsigned RATStalls, unsigned RCUStalls, + unsigned SQStalls, unsigned LDQStalls, + unsigned STQStalls, unsigned DGStalls) const; + void printRATStatistics(unsigned Mappings, unsigned MaxUsedMappings) const; + void printRCUStatistics(const Histogram &Histogram, unsigned Cycles) const; + void printIssuePerCycle(const Histogram &IssuePerCycle, + unsigned TotalCycles) const; + void printSchedulerUsage(const MCSchedModel &SM, + const ArrayRef &Usage) const; + void printGeneralStatistics(unsigned Iterations, unsigned Cycles, + unsigned Instructions, + unsigned DispatchWidth) const; + void printInstructionInfo() const; + + std::unique_ptr GetOutputStream(std::string OutputFile); + void initialize(std::string OputputFileName); + +public: + BackendPrinter(Backend &backend, std::string OutputFileName, + std::unique_ptr IP, bool EnableVerbose) + : B(backend), EnableVerboseOutput(EnableVerbose), MCIP(std::move(IP)) { + initialize(OutputFileName); + } + + ~BackendPrinter() { + if (File) + File->keep(); + } + + bool IsFileValid() const { return File.get(); } + raw_ostream &getOStream() const { + assert(IsFileValid()); + return File->os(); + } + + MCInstPrinter &GetMCInstPrinter() const { return *MCIP; } + + void AddResourcePressureView(); + void AddTimelineView(unsigned MaxIterations = 3, unsigned MaxCycles = 80); + + void print() const; +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/BackendPrinter.cpp =================================================================== --- tools/llvm-mca/BackendPrinter.cpp +++ tools/llvm-mca/BackendPrinter.cpp @@ -0,0 +1,207 @@ +//===--------------------- BackendPrinter.cpp -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the BackendPrinter interface. +/// +//===----------------------------------------------------------------------===// + +#include "BackendPrinter.h" +#include "llvm/CodeGen/TargetSchedule.h" + +namespace mca { + +std::unique_ptr +BackendPrinter::GetOutputStream(std::string OutputFile) { + if (OutputFile == "") + OutputFile = "-"; + std::error_code EC; + auto Out = llvm::make_unique(OutputFile, EC, sys::fs::F_None); + if (!EC) + return Out; + errs() << EC.message() << '\n'; + return nullptr; +} + +void BackendPrinter::printGeneralStatistics(unsigned Iterations, + unsigned Cycles, + unsigned Instructions, + unsigned DispatchWidth) const { + unsigned TotalInstructions = Instructions * Iterations; + double IPC = (double)TotalInstructions / Cycles; + + std::string Buffer; + raw_string_ostream TempStream(Buffer); + TempStream << "Iterations: " << Iterations; + TempStream << "\nInstructions: " << TotalInstructions; + TempStream << "\nTotal Cycles: " << Cycles; + TempStream << "\nDispatch Width: " << DispatchWidth; + TempStream << "\nIPC: " << format("%.2f", IPC) << '\n'; + TempStream.flush(); + File->os() << Buffer; +} + +void BackendPrinter::printRATStatistics(unsigned TotalMappings, + unsigned MaxUsedMappings) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + TempStream << "\n\nRegister Alias Table:"; + TempStream << "\nTotal number of mappings created: " << TotalMappings; + TempStream << "\nMax number of mappings used: " << MaxUsedMappings + << '\n'; + TempStream.flush(); + File->os() << Buffer; +} + +void BackendPrinter::printDispatchStalls(unsigned RATStalls, unsigned RCUStalls, + unsigned SCHEDQStalls, + unsigned LDQStalls, unsigned STQStalls, + unsigned DGStalls) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + TempStream << "\n\nDynamic Dispatch Stall Cycles:\n"; + TempStream << "RAT - Register unavailable: " + << RATStalls; + TempStream << "\nRCU - Retire tokens unavailable: " + << RCUStalls; + TempStream << "\nSCHEDQ - Scheduler full: " + << SCHEDQStalls; + TempStream << "\nLQ - Load queue full: " + << LDQStalls; + TempStream << "\nSQ - Store queue full: " + << STQStalls; + TempStream << "\nGROUP - Static restrictions on the dispatch group: " + << DGStalls; + TempStream << '\n'; + TempStream.flush(); + File->os() << Buffer; +} + +void BackendPrinter::printSchedulerUsage( + const MCSchedModel &SM, const ArrayRef &Usage) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + TempStream << "\n\nScheduler's queue usage:\n"; + const ArrayRef ResourceMasks = B.GetProcResourceMasks(); + for (unsigned i = 0; i < SM.getNumProcResourceKinds(); ++i) { + const MCProcResourceDesc &ProcResource = *SM.getProcResource(i); + if (!ProcResource.BufferSize) + continue; + + for (const BufferUsageEntry &Entry : Usage) + if (ResourceMasks[i] == Entry.first) + TempStream << ProcResource.Name << ", " << Entry.second << '/' + << ProcResource.BufferSize << '\n'; + } + + TempStream.flush(); + File->os() << Buffer; +} + +void BackendPrinter::printInstructionInfo() const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + + TempStream << "\n\nInstruction Info:\n"; + TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n" + << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects\n\n"; + + TempStream << "[1] [2] [3] [4] [5] [6]\tInstructions:\n"; + for (unsigned Idx = 0; Idx < B.GetNumInstructions(); ++Idx) { + const MCInst &Inst = B.GetMCInstFromIndex(Idx); + const InstrDesc &ID = B.GetInstrDesc(Inst); + unsigned NumMicroOpcodes = ID.NumMicroOps; + unsigned Latency = ID.MaxLatency; + double RThroughput = B.GetRThroughput(ID); + TempStream << ' ' << NumMicroOpcodes << " "; + if (NumMicroOpcodes < 10) + TempStream << " "; + else if (NumMicroOpcodes < 100) + TempStream << ' '; + TempStream << Latency << " "; + if (Latency < 10.0) + TempStream << " "; + else if (Latency < 100.0) + TempStream << ' '; + if (RThroughput) { + TempStream << format("%.2f", RThroughput) << ' '; + if (RThroughput < 10.0) + TempStream << " "; + else if (RThroughput < 100.0) + TempStream << ' '; + } else { + TempStream << " - "; + } + TempStream << (ID.MayLoad ? " * " : " "); + TempStream << (ID.MayStore ? " * " : " "); + TempStream << (ID.HasSideEffects ? " * " : " "); + MCIP->printInst(&Inst, TempStream, "", B.GetSTI()); + TempStream << '\n'; + } + + TempStream.flush(); + File->os() << Buffer; +} + +void BackendPrinter::print() const { + assert(IsFileValid()); + unsigned Cycles = B.GetNumCycles(); + printGeneralStatistics(B.GetNumIterations(), Cycles, B.GetNumInstructions(), + B.GetDispatchWidth()); + if (EnableVerboseOutput) { + printDispatchStalls(B.GetNumRATStalls(), B.GetNumRCUStalls(), + B.GetNumSQStalls(), B.GetNumLDQStalls(), + B.GetNumSTQStalls(), B.GetNumDispatchGroupStalls()); + printRATStatistics(B.GetTotalRegisterMappingsCreated(), + B.GetMaxUsedRegisterMappings()); + BS->printHistograms(File->os()); + + std::vector Usage; + B.GetBuffersUsage(Usage); + printSchedulerUsage(B.GetSchedModel(), Usage); + } + + if (RPV) { + RPV->printResourcePressure(getOStream(), Cycles); + printInstructionInfo(); + } + + if (TV) { + TV->printTimeline(getOStream()); + TV->printAverageWaitTimes(getOStream()); + } +} + +void BackendPrinter::AddResourcePressureView() { + if (!RPV) { + RPV = llvm::make_unique( + B.GetSTI(), *MCIP, B.GetSourceMgr(), B.GetProcResourceMasks()); + B.AddEventListener(RPV.get()); + } +} + +void BackendPrinter::AddTimelineView(unsigned MaxIterations, + unsigned MaxCycles) { + if (!TV) { + TV = llvm::make_unique(B.GetSTI(), *MCIP, B.GetSourceMgr(), + MaxIterations, MaxCycles); + B.AddEventListener(TV.get()); + } +} + +void BackendPrinter::initialize(std::string OutputFileName) { + File = std::move(GetOutputStream(OutputFileName)); + MCIP->setPrintImmHex(false); + if (EnableVerboseOutput) { + BS = std::move(llvm::make_unique()); + B.AddEventListener(BS.get()); + } +} + +} // namespace mca. Index: tools/llvm-mca/BackendStatistics.h =================================================================== --- tools/llvm-mca/BackendStatistics.h +++ tools/llvm-mca/BackendStatistics.h @@ -0,0 +1,95 @@ +//===--------------------- BackendStatistics.h ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements a printer class for printing generic Backend +/// statistics related to the dispatch logic, scheduler and retire unit. +/// +/// Example: +/// ======== +/// +/// Dispatch Logic - number of cycles where we saw N instructions dispatched: +/// [# dispatched], [# cycles] +/// 0, 15 (11.5%) +/// 5, 4 (3.1%) +/// +/// Schedulers - number of cycles where we saw N instructions issued: +/// [# issued], [# cycles] +/// 0, 7 (5.4%) +/// 1, 4 (3.1%) +/// 2, 8 (6.2%) +/// +/// Retire Control Unit - number of cycles where we saw N instructions retired: +/// [# retired], [# cycles] +/// 0, 9 (6.9%) +/// 1, 6 (4.6%) +/// 2, 1 (0.8%) +/// 4, 3 (2.3%) +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_BACKENDSTATISTICS_H +#define LLVM_TOOLS_LLVM_MCA_BACKENDSTATISTICS_H + +#include "HWEventListener.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace mca { + +class BackendStatistics : public HWEventListener { + using Histogram = std::map; + Histogram DispatchGroupSizePerCycle; + Histogram RetiredPerCycle; + Histogram IssuedPerCycle; + + unsigned NumDispatched; + unsigned NumIssued; + unsigned NumRetired; + unsigned NumCycles; + + void updateHistograms() { + DispatchGroupSizePerCycle[NumDispatched]++; + IssuedPerCycle[NumIssued]++; + RetiredPerCycle[NumRetired]++; + NumDispatched = 0; + NumIssued = 0; + NumRetired = 0; + } + + void printRetireUnitStatistics(llvm::raw_ostream &OS) const; + void printDispatchUnitStatistics(llvm::raw_ostream &OS) const; + void printSchedulerStatistics(llvm::raw_ostream &OS) const; + +public: + BackendStatistics() : NumDispatched(0), NumIssued(0), NumRetired(0) {} + + void OnInstructionDispatched(unsigned Index) override { NumDispatched++; } + void + OnInstructionIssued(unsigned Index, + const llvm::ArrayRef> + & /* unused */) override { + NumIssued++; + } + void OnInstructionRetired(unsigned Index) override { NumRetired++; } + + void OnCycleBegin(unsigned Cycle) override { NumCycles++; } + + void OnCycleEnd(unsigned Cycle) override { updateHistograms(); } + + void printHistograms(llvm::raw_ostream &OS) { + printDispatchUnitStatistics(OS); + printSchedulerStatistics(OS); + printRetireUnitStatistics(OS); + } +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/BackendStatistics.cpp =================================================================== --- tools/llvm-mca/BackendStatistics.cpp +++ tools/llvm-mca/BackendStatistics.cpp @@ -0,0 +1,79 @@ +//===--------------------- BackendStatistics.cpp ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Functionalities used by the BackendPrinter to print out histograms +/// related to number of {dispatch/issue/retire} per number of cycles. +/// +//===----------------------------------------------------------------------===// + +#include "BackendStatistics.h" +#include "llvm/Support/Format.h" + +using namespace llvm; + +namespace mca { + +void BackendStatistics::printRetireUnitStatistics(llvm::raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + TempStream << "\n\nRetire Control Unit - " + << "number of cycles where we saw N instructions retired:\n"; + TempStream << "[# retired], [# cycles]\n"; + + for (const auto &Entry : RetiredPerCycle) { + TempStream << " " << Entry.first; + if (Entry.first < 10) + TempStream << ", "; + else + TempStream << ", "; + TempStream << Entry.second << " (" + << format("%.1f", ((double)Entry.second / NumCycles) * 100.0) + << "%)\n"; + } + + TempStream.flush(); + OS << Buffer; +} + +void BackendStatistics::printDispatchUnitStatistics(llvm::raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + TempStream << "\n\nDispatch Logic - " + << "number of cycles where we saw N instructions dispatched:\n"; + TempStream << "[# dispatched], [# cycles]\n"; + for (const auto &Entry : DispatchGroupSizePerCycle) { + TempStream << " " << Entry.first << ", " << Entry.second + << " (" + << format("%.1f", ((double)Entry.second / NumCycles) * 100.0) + << "%)\n"; + } + + TempStream.flush(); + OS << Buffer; +} + +void BackendStatistics::printSchedulerStatistics(llvm::raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + TempStream << "\n\nSchedulers - number of cycles where we saw N instructions " + "issued:\n"; + TempStream << "[# issued], [# cycles]\n"; + for (const auto &Entry : IssuedPerCycle) { + TempStream << " " << Entry.first << ", " << Entry.second << " (" + << format("%.1f", ((double)Entry.second / NumCycles) * 100) + << "%)\n"; + } + + TempStream.flush(); + OS << Buffer; +} + +} // namespace mca + Index: tools/llvm-mca/CMakeLists.txt =================================================================== --- tools/llvm-mca/CMakeLists.txt +++ tools/llvm-mca/CMakeLists.txt @@ -0,0 +1,24 @@ +set(LLVM_LINK_COMPONENTS + AllTargetsAsmPrinters + AllTargetsAsmParsers + AllTargetsDescs + AllTargetsDisassemblers + AllTargetsInfos + MC + MCParser + Support + ) + +add_llvm_tool(llvm-mca + Backend.cpp + BackendPrinter.cpp + BackendStatistics.cpp + Dispatch.cpp + InstrBuilder.cpp + Instruction.cpp + LSUnit.cpp + llvm-mca.cpp + ResourcePressureView.cpp + Scheduler.cpp + TimelineView.cpp + ) Index: tools/llvm-mca/Dispatch.h =================================================================== --- tools/llvm-mca/Dispatch.h +++ tools/llvm-mca/Dispatch.h @@ -0,0 +1,315 @@ +//===----------------------- Dispatch.h -------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements classes that are used to model register files, +/// reorder buffers and the hardware dispatch logic. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCH_H +#define LLVM_TOOLS_LLVM_MCA_DISPATCH_H + +#include "Instruction.h" +#include "llvm/MC/MCRegisterInfo.h" +#include + +namespace mca { + +using namespace llvm; + +class WriteState; +class DispatchUnit; +class Scheduler; +class Backend; + +/// \brief Keeps track of register definitions. +/// +/// This class tracks register definitions, and performs register renaming +/// to break anti dependencies. +/// By default, there is no limit in the number of register aliases which +/// can be created for the purpose of register renaming. However, users can +/// specify at object construction time a limit in the number of temporary +/// registers which can be used by the register renaming logic. +class RegisterFile { + const MCRegisterInfo &MRI; + // Currently used mappings and maximum used mappings. + // These are to generate statistics only. + unsigned NumUsedMappings; + unsigned MaxUsedMappings; + // Total number of mappings created over time. + unsigned TotalMappingsCreated; + + // The maximum number of register aliases which can be used by the + // register renamer. Defaut value for this field is zero. + // A value of zero for this field means that there is no limit in the + // amount of register mappings which can be created. That is equivalent + // to having a theoretically infinite number of temporary registers. + unsigned TotalMappings; + + // This map contains an entry for every physical register. + // A register index is used as a key value to access a WriteState. + // This is how we track RAW dependencies for dispatched + // instructions. For every register, we track the last seen write only. + // This assumes that all writes fully update both super and sub registers. + // We need a flag in MCInstrDesc to check if a write also updates super + // registers. We can then have a extra tablegen flag to set for instructions. + // This is a separate patch on its own. + std::vector RegisterMappings; + // Assumptions are: + // a) a false dependencies is always removed by the register renamer. + // b) the register renamer can create an "infinite" number of mappings. + // Since we track the number of mappings created, in future we may + // introduce constraints on the number of mappings that can be created. + // For example, the maximum number of registers that are available for + // register renaming purposes may default to the size of the register file. + + // In future, we can extend this design to allow multiple register files, and + // apply different restrictions on the register mappings and the number of + // temporary registers used by mappings. + +public: + RegisterFile(const MCRegisterInfo &mri, unsigned Mappings = 0) + : MRI(mri), NumUsedMappings(0), MaxUsedMappings(0), + TotalMappingsCreated(0), TotalMappings(Mappings), + RegisterMappings(MRI.getNumRegs(), nullptr) {} + + // Creates a new mapping for RegID. + void AddRegisterMapping(WriteState *WS); + void InvalidateRegisterMapping(const WriteState &WS); + + bool IsAvailable(unsigned NumRegWrites); + void CollectWrites(SmallVector &Writes, + unsigned RegID) const; + void UpdateOnRead(ReadState &RS, unsigned RegID); + unsigned GetMaxUsedRegisterMappings() const { return MaxUsedMappings; } + unsigned GetTotalRegisterMappingsCreated() const { + return TotalMappingsCreated; + } + +#ifndef NDEBUG + void dump() const; +#endif +}; + +/// \brief tracks which instructions are in-flight (i.e. dispatched but not +/// retired) in the OoO backend. +/// +/// This class checks on every cycle if/which instructions can be retired. +/// Instructions are retired in program order. +/// In the event of instruction retired, the DispatchUnit object that owns +/// this RetireControlUnit gets notified. +/// On instruction retired, register updates are all architecturally +/// committed, and any temporary registers originally allocated for the +/// retired instruction are freed. +struct RetireControlUnit { + // A "token" (object of class RUToken) is created by the retire unit for every + // instruction dispatched to the schedulers. Flag 'Executed' is used to + // quickly check if an instruction has reached the write-back stage. A token + // also carries information related to the number of entries consumed by the + // instruction in the reorder buffer. The idea is that those entries will + // become available again once the instruction is retired. On every cycle, + // the RCU (Retire Control Unit) scans every token starting to search for + // instructions that are ready to retire. retired. Instructions are retired + // in program order. Only 'Executed' instructions are eligible for retire. + // Note that the size of the reorder buffer is defined by the scheduling model + // via field 'NumMicroOpBufferSize'. + struct RUToken { + unsigned Index; // Instruction index. + unsigned NumSlots; // Slots reserved to this instruction. + bool Executed; // True if the instruction is past the WB stage. + }; + +private: + unsigned NextAvailableSlotIdx; + unsigned CurrentInstructionSlotIdx; + unsigned AvailableSlots; + unsigned MaxRetirePerCycle; // 0 means no limit. + std::vector Queue; + DispatchUnit *Owner; + +public: + RetireControlUnit(unsigned NumSlots, unsigned RPC, DispatchUnit *DU) + : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), + AvailableSlots(NumSlots), MaxRetirePerCycle(RPC), Owner(DU) { + assert(NumSlots && "Expected at least one slot!"); + Queue.resize(NumSlots); + } + + bool IsFull() const { return !AvailableSlots; } + bool IsEmpty() const { return AvailableSlots == Queue.size(); } + bool IsAvailable(unsigned Quantity = 1) const { + // Some instructions may declare a number of uOps which exceedes the size + // of the reorder buffer. To avoid problems, cap the amount of slots to + // the size of the reorder buffer. + Quantity = std::min(Quantity, static_cast(Queue.size())); + return AvailableSlots >= Quantity; + } + + // Reserves a number of slots, and returns a new token. + unsigned ReserveSlot(unsigned Index, unsigned NumMicroOps); + // Returns the number of instructions retired in this cycle. + void CycleEvent(); + + void OnInstructionExecuted(unsigned TokenID); + +#ifndef NDEBUG + void dump() const; +#endif +}; + +// \brief Implements the hardware dispatch logic. +// +// This class is responsible for the dispatch stage, in which instructions are +// dispatched in groups to the Scheduler. An instruction can be dispatched if +// functional units are available. +// To be more specific, an instruction can be dispatched to the Scheduler if: +// 1) There are enough entries in the reorder buffer (implemented by class +// RetireControlUnit) to accomodate all opcodes. +// 2) There are enough temporaries to rename output register operands. +// 3) There are enough entries available in the used buffered resource(s). +// +// The number of micro opcodes that can be dispatched in one cycle is limited by +// the value of field 'DispatchWidth'. A "dynamic dispatch stall" occurs when +// processor resources are not available (i.e. at least one of the +// abovementioned checks fails). Dispatch stall events are counted during the +// entire execution of the code, and displayed by the performance report when +// flag '-verbose' is specified. +// +// If the number of micro opcodes of an instruction is bigger than +// DispatchWidth, then it can only be dispatched at the beginning of one cycle. +// The DispatchUnit will still have to wait for a number of cycles (depending on +// the DispatchWidth and the number of micro opcodes) before it can serve other +// instructions. +class DispatchUnit { + unsigned DispatchWidth; + unsigned AvailableEntries; + unsigned CarryOver; + Scheduler *SC; + + std::unique_ptr RAT; + std::unique_ptr RCU; + Backend *Owner; + + /// Dispatch stall event identifiers. + /// + /// The naming convention is: + /// * Each event name starts with the "DS_" prefix + /// * For dynamic dispatch stalls, the "DS_" prefix is always followed by the + /// name of the unavailable resource/functional unit (example: RAT) + /// * The last substring captures the reason why the event occurred (example: + /// REG_UNAVAILABLE means that register renaming couldn't find enough spare + /// temporary registers in the register file). + /// + /// List of acronyms used for processor resoures: + /// RAT - Register Alias Table (used by the register renaming logic) + /// RCU - Retire Control Unit + /// SQ - Scheduler's Queue + /// LDQ - Load Queue + /// STQ - Store Queue + enum { + DS_RAT_REG_UNAVAILABLE, + DS_RCU_TOKEN_UNAVAILABLE, + DS_SQ_TOKEN_UNAVAILABLE, + DS_LDQ_TOKEN_UNAVAILABLE, + DS_STQ_TOKEN_UNAVAILABLE, + DS_DISPATCH_GROUP_RESTRICTION, + DS_LAST + }; + + // The DispatchUnit track dispatch stall events caused by unavailable + // of hardware resources. Events are classified based on the stall kind; + // so we have a counter for every source of dispatch stall. Counters are + // stored into a vector `DispatchStall` which is always of size DS_LAST. + std::vector DispatchStalls; + + bool CheckRAT(const InstrDesc &Desc); + bool CheckRCU(const InstrDesc &Desc); + bool CheckScheduler(const InstrDesc &Desc); + + void NotifyInstructionDispatched(unsigned IID); + +public: + DispatchUnit(Backend *B, const MCRegisterInfo &MRI, + unsigned MicroOpBufferSize, unsigned RegisterFileSize, + unsigned MaxRetirePerCycle, unsigned MaxDispatchWidth, + Scheduler *Sched) + : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth), + CarryOver(0U), SC(Sched), + RAT(llvm::make_unique(MRI, RegisterFileSize)), + RCU(llvm::make_unique(MicroOpBufferSize, + MaxRetirePerCycle, this)), + Owner(B), DispatchStalls(DS_LAST, 0) {} + + unsigned GetDispatchWidth() const { return DispatchWidth; } + + bool IsAvailable(unsigned NumEntries) const { + return NumEntries <= AvailableEntries || AvailableEntries == DispatchWidth; + } + + bool IsRCUEmpty() const { return RCU->IsEmpty(); } + + bool CanDispatch(const InstrDesc &Desc) { + assert(IsAvailable(Desc.NumMicroOps)); + return CheckRCU(Desc) && CheckRAT(Desc) && CheckScheduler(Desc); + } + + unsigned Dispatch(unsigned IID, Instruction *NewInst); + + void CollectWrites(SmallVector &Vec, unsigned RegID) const { + return RAT->CollectWrites(Vec, RegID); + } + unsigned GetNumRATStalls() const { + return DispatchStalls[DS_RAT_REG_UNAVAILABLE]; + } + unsigned GetNumRCUStalls() const { + return DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE]; + } + unsigned GetNumSQStalls() const { + return DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE]; + } + unsigned GetNumLDQStalls() const { + return DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE]; + } + unsigned GetNumSTQStalls() const { + return DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE]; + } + unsigned GetNumDispatchGroupStalls() const { + return DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION]; + } + unsigned GetMaxUsedRegisterMappings() const { + return RAT->GetMaxUsedRegisterMappings(); + } + unsigned GetTotalRegisterMappingsCreated() const { + return RAT->GetTotalRegisterMappingsCreated(); + } + void AddNewRegisterMapping(WriteState *WS) { RAT->AddRegisterMapping(WS); } + + void CycleEvent(unsigned Cycle) { + RCU->CycleEvent(); + AvailableEntries = + CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver; + CarryOver = CarryOver >= DispatchWidth ? CarryOver - DispatchWidth : 0U; + } + + void NotifyInstructionRetired(unsigned Index); + + void OnInstructionExecuted(unsigned TokenID) { + return RCU->OnInstructionExecuted(TokenID); + } + + void InvalidateRegisterMappings(const Instruction &Inst); +#ifndef NDEBUG + void dump() const; +#endif +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/Dispatch.cpp =================================================================== --- tools/llvm-mca/Dispatch.cpp +++ tools/llvm-mca/Dispatch.cpp @@ -0,0 +1,267 @@ +//===--------------------- Dispatch.cpp -------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements methods declared by class RegisterFile, DispatchUnit and +/// RetireControlUnit. +/// +//===----------------------------------------------------------------------===// + +#include "Dispatch.h" +#include "Scheduler.h" +#include "Backend.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "llvm-mca" + +namespace mca { + +// Creates a new mapping for RegID. +void RegisterFile::AddRegisterMapping(WriteState *WS) { + unsigned RegID = WS->GetRegisterID(); + assert(RegID && "Adding an invalid register definition?"); + + RegisterMappings[RegID] = WS; + for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) + RegisterMappings[*I] = WS; + if (MaxUsedMappings == NumUsedMappings) + MaxUsedMappings++; + NumUsedMappings++; + TotalMappingsCreated++; + // If this is a partial update, then we are done. + if (!WS->FullyUpdatesSuperRegs()) + return; + + for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) + RegisterMappings[*I] = WS; +} + +void RegisterFile::InvalidateRegisterMapping(const WriteState &WS) { + unsigned RegID = WS.GetRegisterID(); + bool ShouldInvalidateSuperRegs = WS.FullyUpdatesSuperRegs(); + + assert(RegID != 0 && "Invalidating an already invalid register?"); + assert(WS.GetCyclesLeft() != -512 && + "Invalidating a write of unknown cycles!"); + assert(WS.GetCyclesLeft() <= 0 && "Invalid cycles left for this write!"); + if (!RegisterMappings[RegID]) + return; + + assert(NumUsedMappings); + NumUsedMappings--; + + if (RegisterMappings[RegID] == &WS) + RegisterMappings[RegID] = nullptr; + + for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) + if (RegisterMappings[*I] == &WS) + RegisterMappings[*I] = nullptr; + + if (!ShouldInvalidateSuperRegs) + return; + + for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) + if (RegisterMappings[*I] == &WS) + RegisterMappings[*I] = nullptr; +} + +// Update the number of used mappings in the event of instruction retired. +void DispatchUnit::InvalidateRegisterMappings(const Instruction &IS) { + for (const auto &WS : IS.GetDefs()) { + DEBUG(dbgs() << "[RAT] Invalidating mapping for: "); + DEBUG(WS->dump()); + RAT->InvalidateRegisterMapping(*WS.get()); + } +} + +void RegisterFile::CollectWrites(SmallVector &Writes, + unsigned RegID) const { + assert(RegID && RegID < RegisterMappings.size()); + WriteState *WS = RegisterMappings[RegID]; + if (WS) { + DEBUG(dbgs() << "Found a dependent use for RegID=" << RegID << '\n'); + Writes.push_back(WS); + } + + // Handle potential partial register updates. + for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) { + WS = RegisterMappings[*I]; + if (WS && std::find(Writes.begin(), Writes.end(), WS) == Writes.end()) { + DEBUG(dbgs() << "Found a dependent use of subReg " << *I << " (part of " + << RegID << ")\n"); + Writes.push_back(WS); + } + } +} + +bool RegisterFile::IsAvailable(unsigned NumRegWrites) { + if (!TotalMappings) + return true; + if (NumRegWrites > TotalMappings) { + // The user specified a too small number of registers. + // Artificially set the number of temporaries to NumRegWrites. + errs() << "warning: not enough temporaries in the register file. " + << "The register file size has been automatically increased to " + << NumRegWrites << '\n'; + TotalMappings = NumRegWrites; + } + + return NumRegWrites + NumUsedMappings <= TotalMappings; +} + +#ifndef NDEBUG +void RegisterFile::dump() const { + for (unsigned i = 0; i < MRI.getNumRegs(); ++i) + if (RegisterMappings[i]) { + dbgs() << MRI.getName(i) << ", " << i << ", "; + RegisterMappings[i]->dump(); + } + + dbgs() << "TotalMappingsCreated: " << TotalMappingsCreated + << ", MaxUsedMappings: " << MaxUsedMappings + << ", NumUsedMappings: " << NumUsedMappings << '\n'; +} +#endif + +// Reserves a number of slots, and returns a new token. +unsigned RetireControlUnit::ReserveSlot(unsigned Index, unsigned NumMicroOps) { + assert(IsAvailable(NumMicroOps)); + unsigned NormalizedQuantity = + std::min(NumMicroOps, static_cast(Queue.size())); + // Zero latency instructions may have zero mOps. Artificially bump this + // value to 1. Although zero latency instructions don't consume scheduler + // resources, they still consume one slot in the retire queue. + NormalizedQuantity = std::max(NormalizedQuantity, 1U); + unsigned TokenID = NextAvailableSlotIdx; + Queue[NextAvailableSlotIdx] = {Index, NormalizedQuantity, false}; + NextAvailableSlotIdx += NormalizedQuantity; + NextAvailableSlotIdx %= Queue.size(); + AvailableSlots -= NormalizedQuantity; + return TokenID; +} + +void DispatchUnit::NotifyInstructionDispatched(unsigned Index) { + Owner->NotifyInstructionDispatched(Index); +} + +void DispatchUnit::NotifyInstructionRetired(unsigned Index) { + Owner->NotifyInstructionRetired(Index); +} + +void RetireControlUnit::CycleEvent() { + if (IsEmpty()) + return; + + unsigned NumRetired = 0; + while (!IsEmpty()) { + if (MaxRetirePerCycle != 0 && NumRetired == MaxRetirePerCycle) + break; + RUToken &Current = Queue[CurrentInstructionSlotIdx]; + assert(Current.NumSlots && "Reserved zero slots?"); + if (!Current.Executed) + break; + Owner->NotifyInstructionRetired(Current.Index); + CurrentInstructionSlotIdx += Current.NumSlots; + CurrentInstructionSlotIdx %= Queue.size(); + AvailableSlots += Current.NumSlots; + NumRetired++; + } +} + +void RetireControlUnit::OnInstructionExecuted(unsigned TokenID) { + assert(Queue.size() > TokenID); + assert(Queue[TokenID].Executed == false && Queue[TokenID].Index != ~0U); + Queue[TokenID].Executed = true; +} + +#ifndef NDEBUG +void RetireControlUnit::dump() const { + dbgs() << "Retire Unit: { Total Slots=" << Queue.size() + << ", Available Slots=" << AvailableSlots << " }\n"; +} +#endif + +bool DispatchUnit::CheckRAT(const InstrDesc &Desc) { + unsigned NumWrites = Desc.Writes.size(); + if (RAT->IsAvailable(NumWrites)) + return true; + DispatchStalls[DS_RAT_REG_UNAVAILABLE]++; + return false; +} + +bool DispatchUnit::CheckRCU(const InstrDesc &Desc) { + unsigned NumMicroOps = Desc.NumMicroOps; + if (RCU->IsAvailable(NumMicroOps)) + return true; + DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE]++; + return false; +} + +bool DispatchUnit::CheckScheduler(const InstrDesc &Desc) { + // If this is a zero-latency instruction, then it bypasses + // the scheduler. + switch (SC->CanBeDispatched(Desc)) { + case Scheduler::HWS_AVAILABLE: + return true; + case Scheduler::HWS_QUEUE_UNAVAILABLE: + DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE]++; + break; + case Scheduler::HWS_LD_QUEUE_UNAVAILABLE: + DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE]++; + break; + case Scheduler::HWS_ST_QUEUE_UNAVAILABLE: + DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE]++; + break; + case Scheduler::HWS_DISPATCH_GROUP_RESTRICTION: + DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION]++; + } + + return false; +} + +unsigned DispatchUnit::Dispatch(unsigned IID, Instruction *NewInst) { + assert(!CarryOver && "Cannot dispatch another instruction!"); + unsigned NumMicroOps = NewInst->GetDesc().NumMicroOps; + if (NumMicroOps > DispatchWidth) { + assert(AvailableEntries == DispatchWidth); + AvailableEntries = 0; + CarryOver = NumMicroOps - DispatchWidth; + } else { + assert(AvailableEntries >= NumMicroOps); + AvailableEntries -= NumMicroOps; + } + + // Reserve slots in the RCU. + unsigned RCUTokenID = RCU->ReserveSlot(IID, NumMicroOps); + Owner->NotifyInstructionDispatched(IID); + + SC->ScheduleInstruction(IID, NewInst); + return RCUTokenID; +} + +#ifndef NDEBUG +void DispatchUnit::dump() const { + RAT->dump(); + RCU->dump(); + + unsigned DSRAT = DispatchStalls[DS_RAT_REG_UNAVAILABLE]; + unsigned DSRCU = DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE]; + unsigned DSSCHEDQ = DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE]; + unsigned DSLQ = DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE]; + unsigned DSSQ = DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE]; + + dbgs() << "STALLS --- RAT: " << DSRAT << ", RCU: " << DSRCU + << ", SCHED_QUEUE: " << DSSCHEDQ << ", LOAD_QUEUE: " << DSLQ + << ", STORE_QUEUE: " << DSSQ << '\n'; +} +#endif + +} // namespace mca Index: tools/llvm-mca/HWEventListener.h =================================================================== --- tools/llvm-mca/HWEventListener.h +++ tools/llvm-mca/HWEventListener.h @@ -0,0 +1,34 @@ +#ifndef LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H +#define LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H + +#include "llvm/ADT/ArrayRef.h" +#include + +namespace mca { + +using namespace llvm; + +struct HWEventListener { + // Events generated by the Retire Control Unit. + virtual void OnInstructionRetired(unsigned Index){}; + + // Events generated by the Scheduler. + using ResourceRef = std::pair; + virtual void + OnInstructionIssued(unsigned Index, + const ArrayRef> &Used) {} + virtual void OnInstructionExecuted(unsigned Index) {} + virtual void OnInstructionReady(unsigned Index) {} + virtual void OnResourceAvailable(const ResourceRef &RRef){}; + + // Events generated by the Dispatch logic. + virtual void OnInstructionDispatched(unsigned Index) {} + + // Generic events generated by the Backend. + virtual void OnCycleBegin(unsigned Cycle) {} + virtual void OnCycleEnd(unsigned Cycle) {} +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/InstrBuilder.h =================================================================== --- tools/llvm-mca/InstrBuilder.h +++ tools/llvm-mca/InstrBuilder.h @@ -0,0 +1,62 @@ +//===--------------------- InstrBuilder.h -----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// A builder class for instructions that are statically analyzed by llvm-mca. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H +#define LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H + +#include "Dispatch.h" +#include "Instruction.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" + + +namespace mca { + +using namespace llvm; + +class DispatchUnit; + +/// \brief A builder class that knows how to construct Instruction objects. +/// +/// Every llvm-mca Instruction is described by an object of class InstrDesc. +/// An InstrDesc describes which registers are read/written by the instruction, +/// as well as the instruction latency and hardware resources consumed. +/// +/// This class is used by the tool to construct Instructions and instruction +/// descriptors (i.e. InstrDesc objects). +/// Information from the machine scheduling model is used to identify processor +/// resources that are consumed by an instruction. +class InstrBuilder { + const MCInstrInfo &MCII; + const ArrayRef ProcResourceMasks; + + DenseMap> Descriptors; + DenseMap> Instructions; + + void CreateInstrDescImpl(const MCSubtargetInfo &STI, const MCInst &MCI); + +public: + InstrBuilder(const MCInstrInfo &mcii, const ArrayRef Masks) + : MCII(mcii), ProcResourceMasks(Masks) {} + + const InstrDesc &GetOrCreateInstrDesc(const MCSubtargetInfo &STI, + const MCInst &MCI); + + Instruction *CreateInstruction(const MCSubtargetInfo &STI, DispatchUnit &DU, + unsigned Idx, const MCInst &MCI); +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/InstrBuilder.cpp =================================================================== --- tools/llvm-mca/InstrBuilder.cpp +++ tools/llvm-mca/InstrBuilder.cpp @@ -0,0 +1,530 @@ +//===--------------------- InstrBuilder.cpp ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements the InstrBuilder interface. +/// +//===----------------------------------------------------------------------===// + +#include "InstrBuilder.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "llvm-mca" + +namespace mca { + +static void +initializeUsedResources(InstrDesc &ID, const MCSchedClassDesc &SCDesc, + const MCSubtargetInfo &STI, + const ArrayRef ProcResourceMasks) { + const MCSchedModel &SM = STI.getSchedModel(); + + // Populate resources consumed. + using ResourcePlusCycles = std::pair; + std::vector Worklist; + for (unsigned i = 0; i < SCDesc.NumWriteProcResEntries; ++i) { + const MCWriteProcResEntry *PRE = STI.getWriteProcResBegin(&SCDesc) + i; + const MCProcResourceDesc &PR = *SM.getProcResource(PRE->ProcResourceIdx); + uint64_t Mask = ProcResourceMasks[PRE->ProcResourceIdx]; + if (PR.BufferSize != -1) + ID.Buffers.push_back(Mask); + CycleSegment RCy(0, PRE->Cycles, false); + Worklist.emplace_back(ResourcePlusCycles(Mask, ResourceUsage(RCy))); + } + + // Sort elements by mask popcount, so that we prioritize resource units over + // resource groups, and smaller groups over larger groups. + std::sort(Worklist.begin(), Worklist.end(), + [](const ResourcePlusCycles &A, const ResourcePlusCycles &B) { + unsigned popcntA = countPopulation(A.first); + unsigned popcntB = countPopulation(B.first); + if (popcntA < popcntB) + return true; + if (popcntA > popcntB) + return false; + return A.first < B.first; + }); + + uint64_t UsedResourceUnits = 0; + + // Remove cycles contributed by smaller resources. + for (unsigned i = 0; i < Worklist.size(); ++i) { + ResourcePlusCycles &A = Worklist[i]; + if (!A.second.size()) { + A.second.NumUnits = 0; + A.second.SetReserved(); + ID.Resources.emplace_back(A); + continue; + } + + ID.Resources.emplace_back(A); + uint64_t NormalizedMask = A.first; + if (countPopulation(A.first) == 1) { + UsedResourceUnits |= A.first; + } else { + // Remove the leading 1 from the resource group mask. + NormalizedMask ^= PowerOf2Floor(NormalizedMask); + } + + for (unsigned j = i + 1; j < Worklist.size(); ++j) { + ResourcePlusCycles &B = Worklist[j]; + if ((NormalizedMask & B.first) == NormalizedMask) { + B.second.CS.Subtract(A.second.size()); + if (countPopulation(B.first) > 1) + B.second.NumUnits++; + } + } + } + + // A SchedWrite may specify a number of cycles in which a resource group + // is reserved. For example (on target x86; cpu Haswell): + // + // SchedWriteRes<[HWPort0, HWPort1, HWPort01]> { + // let ResourceCycles = [2, 2, 3]; + // } + // + // This means: + // Resource units HWPort0 and HWPort1 are both used for 2cy. + // Resource group HWPort01 is the union of HWPort0 and HWPort1. + // Since this write touches both HWPort0 and HWPort1 for 2cy, HWPort01 + // will not be usable for 2 entire cycles from instruction issue. + // + // On top of those 2cy, SchedWriteRes explicitly specifies an extra latency + // of 3 cycles for HWPort01. This tool assumes that the 3cy latency is an + // extra delay on top of the 2 cycles latency. + // During those extra cycles, HWPort01 is not usable by other instructions. + for (ResourcePlusCycles &RPC : ID.Resources) { + if (countPopulation(RPC.first) > 1 && !RPC.second.IsReserved()) { + // Remove the leading 1 from the resource group mask. + uint64_t Mask = RPC.first ^ PowerOf2Floor(RPC.first); + if ((Mask & UsedResourceUnits) == Mask) + RPC.second.SetReserved(); + } + } + + DEBUG(for (const auto &R + : ID.Resources) dbgs() + << "\t\tMask=" << R.first << ", cy=" << R.second.size() << '\n'; + for (const auto &R + : ID.Buffers) dbgs() + << "\t\tBuffer Mask=" << R << '\n';); +} + +static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc, + const MCSchedClassDesc &SCDesc, + const MCSubtargetInfo &STI) { + unsigned MaxLatency = 0; + unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries; + for (unsigned i = 0; i < NumWriteLatencyEntries; ++i) { + int Cycles = STI.getWriteLatencyEntry(&SCDesc, i)->Cycles; + // Check if this is an unknown latency. Conservatively (pessimistically) + // assume a latency of 100cy if late. + if (Cycles == -1) + Cycles = 100; + MaxLatency = std::max(MaxLatency, static_cast(Cycles)); + } + + if (MCDesc.isCall()) { + // We cannot estimate how long this call will take. + // Artificially set an arbitrarily high latency (100cy). + MaxLatency = std::max(100U, MaxLatency); + } + + // Check if this instruction consumes any processor resources. + // If the latency is unknown, then conservatively set it equal to the maximum + // number of cycles for a resource consumed by this instruction. + if (!MaxLatency && ID.Resources.size()) { + // Check if this instruction consumes any processor resources. + // If so, then compute the max of the cycles spent for each resource, and + // use it as the MaxLatency. + for (const auto &Resource : ID.Resources) + MaxLatency = std::max(MaxLatency, Resource.second.size()); + } + + if (SCDesc.isVariant() && MaxLatency == 0) { + errs() << "note: unknown latency for a variant opcode. Conservatively" + << " assume a default latency of 1cy.\n"; + MaxLatency = 1; + } + + ID.MaxLatency = MaxLatency; +} + +static void populateWrites(InstrDesc &ID, const MCInst &MCI, + const MCInstrDesc &MCDesc, + const MCSchedClassDesc &SCDesc, + const MCSubtargetInfo &STI) { + computeMaxLatency(ID, MCDesc, SCDesc, STI); + + // Set if writes through this opcode may update super registers. + // TODO: on x86-64, a 4 byte write of a general purpose register always + // fully updates the super-register. + // More in general, (at least on x86) not all register writes perform + // a partial (super-)register update. + // For example, an AVX instruction that writes on a XMM register implicitly + // zeroes the upper half of every aliasing super-register. + // + // For now, we pessimistically assume that writes are all potentially + // partial register updates. This is a good default for most targets, execept + // for those like x86 which implement a special semantic for certain opcodes. + // At least on x86, this may lead to an inaccurate prediction of the + // instruction level parallelism. + bool FullyUpdatesSuperRegisters = false; + + // Now Populate Writes. + + // This algorithm currently works under the strong (and potentially incorrect) + // assumption that information related to register def/uses can be obtained + // from MCInstrDesc. + // + // However class MCInstrDesc is used to describe MachineInstr objects and not + // MCInst objects. To be more specific, MCInstrDesc objects are opcode + // descriptors that are automatically generated via tablegen based on the + // instruction set information available from the target .td files. That + // means, the number of (explicit) definitions according to MCInstrDesc alwyas + // matches the cardinality of the `(outs)` set in tablegen. + // + // By constructions, definitions must appear first in the operand sequence of + // a MachineInstr. Also, the (outs) sequence is preserved (example: the first + // element in the outs set is the first operand in the corresponding + // MachineInstr). That's the reason why MCInstrDesc only need to declare the + // total number of register definitions, and not where those definitions are + // in the machine operand sequence. + // + // Unfortunately, it is not safe to use the information from MCInstrDesc to + // also describe MCInst objects. An MCInst object can be obtained from a + // MachineInstr through a lowering step which may restructure the operand + // sequence (and even remove or introduce new operands). So, there is a high + // risk that the lowering step breaks the assumptions that register + // definitions are always at the beginning of the machine operand sequence. + // + // This is a fundamental problem, and it is still an open problem. Essentially + // we have to find a way to correlate def/use operands of a MachineInstr to + // operands of an MCInst. Otherwise, we cannot correctly reconstruct data + // dependencies, nor we can correctly interpret the scheduling model, which + // heavily uses machine operand indices to define processor read-advance + // information, and to identify processor write resources. Essentially, we + // need something like a MCInstrDesc, but for MCInst. But we also need a way + // to map MCInst operands back to MachineInstr operands, in order to be able + // to correctly identify register definitions, register uses, and read-advance + // information. + + // This requires that to compute operand indices related to MCInst register + // definitions. This information could be automatically generated via tablegen + // at least for all the opcodes that don't require a custom lowering step from + // MachineInstr to MCInst. As long as we can correlate register definitions + // from a MachineInstr to register definitions in a MCInst, then we are done. + // + // Unfortunately, we don't have that information now. So, this prototype + // currently work under the strong (and potentially flawed) assumption that we + // can always safely trust the content of an MCInstrDesc. For example, we can + // query a MCInstrDesc to obtain the number of explicit and implicit register + // defintions. We also assume that register definitions always come first in + // the operand sequence. This last assumption normally makes sense for + // MachineInstr, where register definitions always appear at the beginning of + // the operands sequence. In reality, these assumptions could be broken by the + // lowering step, which can decide to lay out operands in a different order + // than the original order of operand as specified by the MachineInstr. + // + // Things get even more complicated in the presence of "optional" register + // definitions. For MachineInstr, optional register definitions are always at + // the end of the operand sequence. Some ARM instructions that may update the + // status flags specify that register as a optional operand. Since we don't + // have operand descriptors for MCInst, we assume for now that the optional + // definition is always the last operand of a MCInst. Again, this assumption + // may be okay for most targets. However, there is no guarantee that targets + // would respect that. + // + // In conclusion: these are for now the strong assumptions made by the tool: + // * The number of explicit and implicit register definitions in a MCInst + // matches the number of explicit and implicit definitions according to + // the opcode descriptor (MCInstrDesc). + // * Register definitions take precedence over register uses in the operands + // list. + // * If an opcode specifies an optional definition, then the optional + // definition is always the last operand in the sequence, and it can be + // set to zero (i.e. "no register"). + // + // These assumptions luckily work quite well for some out-of-order + // in-tree targets like x86. This is mainly because the vast majority of + // the instructions is expanded to MCInst in a very straightforward way. + // So, the lowering of a MachineOperand is often implemented as a loop of + // "lower operand", which preservers the original order of the operands. + // + // In the longer term, we obviously need a proper solution for this issue. + unsigned NumExplicitDefs = MCDesc.getNumDefs(); + unsigned NumImplicitDefs = MCDesc.getNumImplicitDefs(); + unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries; + unsigned TotalDefs = NumExplicitDefs + NumImplicitDefs; + if (MCDesc.hasOptionalDef()) + TotalDefs++; + ID.Writes.resize(TotalDefs); + // Iterate over the operands list, and skip non-register operands. + // The first NumExplictDefs register operands are expected to be register + // definitions. + unsigned CurrentDef = 0; + unsigned i = 0; + for (; i < MCI.getNumOperands() && CurrentDef < NumExplicitDefs; ++i) { + const MCOperand &Op = MCI.getOperand(i); + if (!Op.isReg()) + continue; + + WriteDescriptor &Write = ID.Writes[CurrentDef]; + Write.OpIndex = i; + if (CurrentDef < NumWriteLatencyEntries) { + const MCWriteLatencyEntry &WLE = + *STI.getWriteLatencyEntry(&SCDesc, CurrentDef); + // Conservatively default to MaxLatency. + Write.Latency = WLE.Cycles == -1 ? ID.MaxLatency : WLE.Cycles; + Write.SClassOrWriteResourceID = WLE.WriteResourceID; + } else { + // Assign a default latency for this write. + Write.Latency = ID.MaxLatency; + Write.SClassOrWriteResourceID = 0; + } + Write.FullyUpdatesSuperRegs = FullyUpdatesSuperRegisters; + Write.IsOptionalDef = false; + DEBUG(dbgs() << "\t\tOpIdx=" << Write.OpIndex + << ", Latency=" << Write.Latency << ", WriteResourceID=" + << Write.SClassOrWriteResourceID << '\n'); + CurrentDef++; + } + + if (CurrentDef != NumExplicitDefs) + llvm::report_fatal_error( + "error: Expected more register operand definitions. "); + + CurrentDef = 0; + for (CurrentDef = 0; CurrentDef < NumImplicitDefs; ++CurrentDef) { + unsigned Index = NumExplicitDefs + CurrentDef; + WriteDescriptor &Write = ID.Writes[Index]; + Write.OpIndex = -1; + Write.RegisterID = MCDesc.getImplicitDefs()[CurrentDef]; + Write.Latency = ID.MaxLatency; + Write.SClassOrWriteResourceID = 0; + Write.IsOptionalDef = false; + assert(Write.RegisterID != 0 && "Expected a valid phys register!"); + DEBUG(dbgs() << "\t\tOpIdx=" << Write.OpIndex << ", PhysReg=" + << Write.RegisterID << ", Latency=" << Write.Latency + << ", WriteResourceID=" << Write.SClassOrWriteResourceID + << '\n'); + } + + if (MCDesc.hasOptionalDef()) { + // Always assume that the optional definition is the last operand of the + // MCInst sequence. + const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1); + if (i == MCI.getNumOperands() || !Op.isReg()) + llvm::report_fatal_error( + "error: expected a register operand for an optional " + "definition. Instruction has not be correctly analyzed.\n", + false); + + WriteDescriptor &Write = ID.Writes[TotalDefs - 1]; + Write.OpIndex = MCI.getNumOperands() - 1; + // Assign a default latency for this write. + Write.Latency = ID.MaxLatency; + Write.SClassOrWriteResourceID = 0; + Write.IsOptionalDef = true; + } +} + +static void populateReads(InstrDesc &ID, const MCInst &MCI, + const MCInstrDesc &MCDesc, + const MCSchedClassDesc &SCDesc, + const MCSubtargetInfo &STI) { + unsigned SchedClassID = MCDesc.getSchedClass(); + bool HasReadAdvanceEntries = SCDesc.NumReadAdvanceEntries > 0; + + unsigned i = 0; + unsigned NumExplicitDefs = MCDesc.getNumDefs(); + // Skip explicit definitions. + for (; i < MCI.getNumOperands() && NumExplicitDefs; ++i) { + const MCOperand &Op = MCI.getOperand(i); + if (Op.isReg()) + NumExplicitDefs--; + } + + if (NumExplicitDefs) + llvm::report_fatal_error( + "error: Expected more register operand definitions. ", false); + + unsigned NumExplicitUses = MCI.getNumOperands() - i; + unsigned NumImplicitUses = MCDesc.getNumImplicitUses(); + if (MCDesc.hasOptionalDef()) { + assert(NumExplicitUses); + NumExplicitUses--; + } + unsigned TotalUses = NumExplicitUses + NumImplicitUses; + if (!TotalUses) + return; + + ID.Reads.resize(TotalUses); + for (unsigned CurrentUse = 0; CurrentUse < NumExplicitUses; ++CurrentUse) { + ReadDescriptor &Read = ID.Reads[CurrentUse]; + Read.OpIndex = i + CurrentUse; + Read.HasReadAdvanceEntries = HasReadAdvanceEntries; + Read.SchedClassID = SchedClassID; + DEBUG(dbgs() << "\t\tOpIdx=" << Read.OpIndex); + } + + for (unsigned CurrentUse = 0; CurrentUse < NumImplicitUses; ++CurrentUse) { + ReadDescriptor &Read = ID.Reads[NumExplicitUses + CurrentUse]; + Read.OpIndex = -1; + Read.RegisterID = MCDesc.getImplicitUses()[CurrentUse]; + Read.HasReadAdvanceEntries = false; + Read.SchedClassID = SchedClassID; + DEBUG(dbgs() << "\t\tOpIdx=" << Read.OpIndex + << ", RegisterID=" << Read.RegisterID << '\n'); + } +} + +void InstrBuilder::CreateInstrDescImpl(const MCSubtargetInfo &STI, + const MCInst &MCI) { + assert(STI.getSchedModel().hasInstrSchedModel() && + "Itineraries are not yet supported!"); + + unsigned short Opcode = MCI.getOpcode(); + // Obtain the instruction descriptor from the opcode. + const MCInstrDesc &MCDesc = MCII.get(Opcode); + const MCSchedModel &SM = STI.getSchedModel(); + + // Then obtain the scheduling class information from the instruction. + const MCSchedClassDesc &SCDesc = + *SM.getSchedClassDesc(MCDesc.getSchedClass()); + + // Create a new empty descriptor. + InstrDesc *ID = new InstrDesc(); + + if (SCDesc.isVariant()) { + errs() << "warning: don't know how to model variant opcodes.\n" + << "note: assume 1 micro opcode.\n"; + ID->NumMicroOps = 1U; + } else { + ID->NumMicroOps = SCDesc.NumMicroOps; + } + + if (MCDesc.isCall()) { + // We don't correctly model calls. + errs() << "warning: found a call in the input assembly sequence.\n" + << "note: call instructions are not correctly modeled. Assume a " + "latency of 100cy.\n"; + } + + if (MCDesc.isReturn()) { + errs() << "warning: found a return instruction in the input assembly " + "sequence.\n" + << "note: program counter updates are ignored.\n"; + } + + ID->MayLoad = MCDesc.mayLoad(); + ID->MayStore = MCDesc.mayStore(); + ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects(); + + initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks); + populateWrites(*ID, MCI, MCDesc, SCDesc, STI); + populateReads(*ID, MCI, MCDesc, SCDesc, STI); + + DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n'); + DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n'); + + // Now add the new descriptor. + Descriptors[Opcode] = std::unique_ptr(ID); +} + +const InstrDesc &InstrBuilder::GetOrCreateInstrDesc(const MCSubtargetInfo &STI, + const MCInst &MCI) { + auto it = Descriptors.find(MCI.getOpcode()); + if (it == Descriptors.end()) + CreateInstrDescImpl(STI, MCI); + return *Descriptors[MCI.getOpcode()].get(); +} + +Instruction *InstrBuilder::CreateInstruction(const MCSubtargetInfo &STI, + DispatchUnit &DU, unsigned Idx, + const MCInst &MCI) { + const InstrDesc &D = GetOrCreateInstrDesc(STI, MCI); + Instruction *NewIS = new Instruction(D); + + // Populate Reads first. + const MCSchedModel &SM = STI.getSchedModel(); + SmallVector DependentWrites; + for (const ReadDescriptor &RD : D.Reads) { + int RegID = -1; + if (RD.OpIndex != -1) { + // explicit read. + const MCOperand &Op = MCI.getOperand(RD.OpIndex); + // Skip non-register operands. + if (!Op.isReg()) + continue; + RegID = Op.getReg(); + } else { + // Implicit read. + RegID = RD.RegisterID; + } + + // Skip invalid register operands. + if (!RegID) + continue; + + // Okay, this is a register operand. Create a ReadState for it. + assert(RegID > 0 && "Invalid register ID found!"); + ReadState *NewRDS = new ReadState(RD); + NewIS->GetUses().emplace_back(std::unique_ptr(NewRDS)); + DU.CollectWrites(DependentWrites, RegID); + NewRDS->SetDependentWrites(DependentWrites.size()); + DEBUG(dbgs() << "Found " << DependentWrites.size() + << " dependent writes\n"); + + // We know that this read depends on all the writes in DependentWrites. + // For each write, check if we have ReadAdvance information, and use it + // to figure out after how many cycles this read becomes available. + if (!RD.HasReadAdvanceEntries) { + for (WriteState *WS : DependentWrites) + WS->AddUser(NewRDS, /* ReadAdvance */ 0); + // Prepare the set for another round. + DependentWrites.clear(); + continue; + } + + const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID); + for (WriteState *WS : DependentWrites) { + unsigned WriteResID = WS->GetWriteResourceID(); + int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.OpIndex, WriteResID); + WS->AddUser(NewRDS, ReadAdvance); + } + + // Prepare the set for another round. + DependentWrites.clear(); + } + + // Now populate writes. + for (const WriteDescriptor &WD : D.Writes) { + unsigned RegID = + WD.OpIndex == -1 ? WD.RegisterID : MCI.getOperand(WD.OpIndex).getReg(); + assert((RegID || WD.IsOptionalDef) && "Expected a valid register ID!"); + // Special case where this is a optional definition, and the actual register + // is 0. + if (WD.IsOptionalDef && !RegID) + continue; + + WriteState *NewWS = new WriteState(WD); + NewIS->GetDefs().emplace_back(std::unique_ptr(NewWS)); + NewWS->SetRegisterID(RegID); + DU.AddNewRegisterMapping(NewWS); + } + + // Update Latency. + NewIS->SetCyclesLeft(D.MaxLatency); + return NewIS; +} + +} // namespace mca Index: tools/llvm-mca/Instruction.h =================================================================== --- tools/llvm-mca/Instruction.h +++ tools/llvm-mca/Instruction.h @@ -0,0 +1,336 @@ +//===--------------------- Instruction.h ------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines abstractions used by the Backend to model register reads, +/// register writes and instructions. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H +#define LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H + +#include "llvm/Support/MathExtras.h" +#include +#include +#include + +namespace mca { + +using namespace llvm; + +struct WriteDescriptor; +struct ReadDescriptor; +class WriteState; +class ReadState; + +constexpr int UNKNOWN_CYCLES = -512; + +/// \brief A register write descriptor. +struct WriteDescriptor { + int OpIndex; // Operand index. -1 if this is an implicit write. + // Write latency. Number of cycles before write-back stage. + int Latency; + // This field is set to a value different than zero only if this + // is an implicit definition. + unsigned RegisterID; + // True if this write generates a partial update of a super-registers. + // On X86, this flag is set by byte/word writes on GPR registers. Also, + // a write of an XMM register only partially updates the corresponding + // YMM super-register if the write is associated to a legacy SSE instruction. + bool FullyUpdatesSuperRegs; + // Instruction itineraries would set this field to the SchedClass ID. + // Otherwise, it defaults to the WriteResourceID from teh MCWriteLatencyEntry + // element associated to this write. + // When computing read latencies, this value is matched against the + // "ReadAdvance" information. The hardware backend may implement + // dedicated forwarding paths to quickly propagate write results to dependent + // instructions waiting in the reservation station (effectively bypassing the + // write-back stage). + unsigned SClassOrWriteResourceID; + // True only if this is a write obtained from an optional definition. + // Optional definitions are allowed to reference regID zero (i.e. "no + // register"). + bool IsOptionalDef; +}; + +/// \brief A register read descriptor. +struct ReadDescriptor { + // This field defaults to -1 if this is an implicit read. + int OpIndex; + // This field is only set if this is an implicit read. + unsigned RegisterID; + // Scheduling Class Index. It is used to query the scheduling model for the + // MCSchedClassDesc object. + unsigned SchedClassID; + // True if there may be a local forwarding logic in hardware to serve a + // write used by this read. This information, along with SchedClassID, is + // used to dynamically check at Instruction creation time, if the input + // operands can benefit from a ReadAdvance bonus. + bool HasReadAdvanceEntries; +}; + +/// \brief Tracks uses of a register definition (e.g. register write). +/// +/// Each implicit/explicit register write is associated with an instance of +/// this class. A WriteState object tracks the dependent users of a +/// register write. It also tracks how many cycles are left before the write +/// back stage. +class WriteState { + const WriteDescriptor &WD; + // On instruction issue, this field is set equal to the write latency. + // Before instruction issue, this field defaults to -512, a special + // value that represents an "unknown" number of cycles. + int CyclesLeft; + + // Actual register defined by this write. This field is only used + // to speedup queries on the register file. + // For implicit writes, this field always matches the value of + // field RegisterID from WD. + unsigned RegisterID; + + // A list of dependent reads. Users is a set of dependent + // reads. A dependent read is added to the set only if CyclesLeft + // is "unknown". As soon as CyclesLeft is 'known', each user in the set + // gets notified with the actual CyclesLeft. + + // The 'second' element of a pair is a "ReadAdvance" number of cycles. + std::set> Users; + +public: + WriteState(const WriteDescriptor &Desc) + : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(Desc.RegisterID) {} + WriteState(const WriteState &Other) = delete; + WriteState &operator=(const WriteState &Other) = delete; + + int GetCyclesLeft() const { return CyclesLeft; } + unsigned GetWriteResourceID() const { return WD.SClassOrWriteResourceID; } + unsigned GetRegisterID() const { return RegisterID; } + void SetRegisterID(unsigned ID) { RegisterID = ID; } + + void AddUser(ReadState *Use, int ReadAdvance); + bool FullyUpdatesSuperRegs() const { return WD.FullyUpdatesSuperRegs; } + bool IsWrittenBack() const { return CyclesLeft == 0; } + + // On every cycle, update CyclesLeft and notify dependent users. + void CycleEvent(); + void OnInstructionIssued(); + +#ifndef NDEBUG + void dump() const; +#endif +}; + +/// \brief Tracks register operand latency in cycles. +/// +/// A read may be dependent on more than one write. This occurs when some +/// writes only partially update the register associated to this read. +class ReadState { + const ReadDescriptor &RD; + unsigned DependentWrites; + int CyclesLeft; + unsigned TotalCycles; + +public: + bool IsReady() const { + if (DependentWrites) + return false; + return (CyclesLeft == UNKNOWN_CYCLES || CyclesLeft == 0); + } + + ReadState(const ReadDescriptor &Desc) + : RD(Desc), DependentWrites(0), CyclesLeft(UNKNOWN_CYCLES), + TotalCycles(0) {} + ReadState(const ReadState &Other) = delete; + ReadState &operator=(const ReadState &Other) = delete; + + const ReadDescriptor &GetDescriptor() const { return RD; } + unsigned GetSchedClass() const { return RD.SchedClassID; } + void CycleEvent(); + void WriteStartEvent(unsigned Cycles); + void SetDependentWrites(unsigned Writes) { DependentWrites = Writes; } +}; + +/// \brief A sequence of cycles. +/// +/// This class can be used as a building block to construct ranges of cycles. +class CycleSegment { + unsigned Begin; // Inclusive. + unsigned End; // Exclusive. + bool Reserved; // Resources associated to this segment must be reserved. + +public: + CycleSegment(unsigned StartCycle, unsigned EndCycle, bool IsReserved = false) + : Begin(StartCycle), End(EndCycle), Reserved(IsReserved) {} + + bool Contains(unsigned Cycle) const { return Cycle >= Begin && Cycle < End; } + bool StartsAfter(const CycleSegment &CS) const { return End <= CS.Begin; } + bool EndsBefore(const CycleSegment &CS) const { return Begin >= CS.End; } + bool Overlaps(const CycleSegment &CS) const { + return !StartsAfter(CS) && !EndsBefore(CS); + } + bool IsExecuting() const { return Begin == 0 && End != 0; } + bool IsExecuted() const { return End == 0; } + bool operator<(const CycleSegment &Other) const { + return Begin < Other.Begin; + } + CycleSegment &operator--(void) { + if (Begin) + Begin--; + if (End) + End--; + return *this; + } + + bool IsValid() const { return Begin <= End; } + unsigned size() const { return End - Begin; }; + void Subtract(unsigned Cycles) { + assert(End >= Cycles); + End -= Cycles; + } + + unsigned begin() const { return Begin; } + unsigned end() const { return End; } + void setEnd(unsigned NewEnd) { End = NewEnd; } + bool IsReserved() const { return Reserved; } + void SetReserved() { Reserved = true; } +}; + +/// \brief Helper used by class InstrDesc to describe how hardware resources +/// are used. +/// +/// This class describes how many resource units of a specific resource kind +/// (and how many cycles) are "used" by an instruction. +struct ResourceUsage { + CycleSegment CS; + unsigned NumUnits; + ResourceUsage(CycleSegment Cycles, unsigned Units = 1) + : CS(Cycles), NumUnits(Units) {} + unsigned size() const { return CS.size(); } + bool IsReserved() const { return CS.IsReserved(); } + void SetReserved() { CS.SetReserved(); } +}; + +/// \brief An instruction descriptor +struct InstrDesc { + std::vector Writes; // Implicit writes are at the end. + std::vector Reads; // Implicit reads are at the end. + + // For every resource used by an instruction of this kind, this vector + // reports the number of "consumed cycles". + std::vector> Resources; + + // A list of buffered resources consumed by this instruction. + std::vector Buffers; + unsigned MaxLatency; + // Number of MicroOps for this instruction. + unsigned NumMicroOps; + + bool MayLoad; + bool MayStore; + bool HasSideEffects; +}; + +/// An instruction dispatched to the out-of-order backend. +/// +/// This class is used to monitor changes in the internal state of instructions +/// that are dispatched by the DispatchUnit to the hardware schedulers. +class Instruction { + const InstrDesc &Desc; + + enum InstrStage { + IS_INVALID, // Instruction in an invalid state. + IS_AVAILABLE, // Instruction dispatched but operands are not ready. + IS_READY, // Instruction dispatched and operands ready. + IS_EXECUTING, // Instruction issued. + IS_EXECUTED, // Instruction executed. Values are written back. + IS_RETIRED // Instruction retired. + }; + + // The current instruction stage. + enum InstrStage Stage; + + // This value defaults to the instruction latency. This instruction is + // considered executed when field CyclesLeft goes to zero. + int CyclesLeft; + + // Retire Unit token ID for this instruction. + unsigned RCUTokenID; + + using VecDefs = std::vector>; + using VecUses = std::vector>; + + // Output dependencies. + // One entry per each implicit and explicit register definition. + VecDefs Defs; + + // Input dependencies. + // One entry per each implicit and explicit register use. + VecUses Uses; + + // This instruction has already been dispatched, and all operands are ready. + void SetReady() { + assert(Stage == IS_AVAILABLE); + Stage = IS_READY; + } + +public: + Instruction(const InstrDesc &D) + : Desc(D), Stage(IS_INVALID), CyclesLeft(-1) {} + Instruction(const Instruction &Other) = delete; + Instruction &operator=(const Instruction &Other) = delete; + + VecDefs &GetDefs() { return Defs; } + const VecDefs &GetDefs() const { return Defs; } + VecUses &GetUses() { return Uses; } + const VecUses &GetUses() const { return Uses; } + const InstrDesc &GetDesc() const { return Desc; } + + unsigned GetRCUTokenID() const { return RCUTokenID; } + int GetCyclesLeft() const { return CyclesLeft; } + void SetCyclesLeft(int Cycles) { CyclesLeft = Cycles; } + void SetRCUTokenID(unsigned TokenID) { RCUTokenID = TokenID; } + + // Transition to the dispatch stage. + // No definition is updated because the instruction is not "executing". + void Dispatch() { + assert(Stage == IS_INVALID); + Stage = IS_AVAILABLE; + } + + // Instruction issued. Transition to the IS_EXECUTING state, and update + // all the definitions. + void Execute(); + + void ForceExecuted() { + assert((Stage == IS_INVALID && IsZeroLatency()) || + (Stage == IS_READY && Desc.MaxLatency == 0)); + Stage = IS_EXECUTED; + } + + // Checks if operands are available. If all operands area ready, + // then this forces a transition from IS_AVAILABLE to IS_READY. + bool IsReady(); + + bool IsDispatched() const { return Stage == IS_AVAILABLE; } + bool IsExecuting() const { return Stage == IS_EXECUTING; } + bool IsExecuted() const { return Stage == IS_EXECUTED; } + bool IsZeroLatency() const; + + void Retire() { + assert(Stage == IS_EXECUTED); + Stage = IS_RETIRED; + } + + void CycleEvent(); +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/Instruction.cpp =================================================================== --- tools/llvm-mca/Instruction.cpp +++ tools/llvm-mca/Instruction.cpp @@ -0,0 +1,132 @@ +//===--------------------- Instruction.cpp ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines abstractions used by the Backend to model register reads, +// register writes and instructions. +// +//===----------------------------------------------------------------------===// + +#include "Instruction.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +namespace mca { + +void ReadState::WriteStartEvent(unsigned Cycles) { + assert(DependentWrites); + assert(CyclesLeft == UNKNOWN_CYCLES); + + // This read may be dependent on more than one write. This typically occurs + // when a definition is the result of multiple writes where at least one + // write does a partial register update. + // The HW is forced to do some extra bookkeeping to track of all the + // dependent writes, and implement a merging scheme for the partial writes. + --DependentWrites; + TotalCycles = std::max(TotalCycles, Cycles); + + if (!DependentWrites) + CyclesLeft = TotalCycles; +} + +void WriteState::OnInstructionIssued() { + assert(CyclesLeft == UNKNOWN_CYCLES); + // Update the number of cycles left based on the WriteDescriptor info. + CyclesLeft = WD.Latency; + + // Now that the time left before write-back is know, notify + // all the users. + for (auto &User : Users) { + ReadState *RS = User.first; + unsigned ReadCycles = std::max(0, CyclesLeft - User.second); + RS->WriteStartEvent(ReadCycles); + } +} + +void WriteState::AddUser(ReadState *User, int ReadAdvance) { + // If CyclesLeft is different than -1, then we don't need to + // update the list of users. We can just notify the user with + // the actual number of cycles left (which may be zero). + if (CyclesLeft != UNKNOWN_CYCLES) { + unsigned ReadCycles = std::max(0, CyclesLeft - ReadAdvance); + User->WriteStartEvent(ReadCycles); + return; + } + + std::pair NewPair(User, ReadAdvance); + Users.insert(NewPair); +} + +void WriteState::CycleEvent() { + // Note: CyclesLeft can be a negative number. It is an error to + // make it an unsigned quantity because users of this write may + // specify a negative ReadAdvance. + if (CyclesLeft != UNKNOWN_CYCLES) + CyclesLeft--; +} + +void ReadState::CycleEvent() { + // If CyclesLeft is unknown, then bail out immediately. + if (CyclesLeft == UNKNOWN_CYCLES) + return; + + // If there are still dependent writes, or we reached cycle zero, + // then just exit. + if (DependentWrites || CyclesLeft == 0) + return; + + CyclesLeft--; +} + +#ifndef NDEBUG +void WriteState::dump() const { + dbgs() << "{ OpIdx=" << WD.OpIndex << ", Lat=" << WD.Latency << ", RegID " + << GetRegisterID() << ", Cycles Left=" << GetCyclesLeft() << " }\n"; +} +#endif + +bool Instruction::IsReady() { + if (Stage == IS_READY) + return true; + + assert(Stage == IS_AVAILABLE); + for (const auto &Use : Uses) + if (!Use.get()->IsReady()) + return false; + + SetReady(); + return true; +} + +void Instruction::Execute() { + assert(Stage == IS_READY); + Stage = IS_EXECUTING; + for (auto &Def : Defs) + Def->OnInstructionIssued(); +} + +bool Instruction::IsZeroLatency() const { + return Desc.MaxLatency == 0 && Defs.size() == 0 && Uses.size() == 0; +} + +void Instruction::CycleEvent() { + if (IsDispatched()) { + for (auto &Use : Uses) + Use->CycleEvent(); + return; + } + if (IsExecuting()) { + for (auto &Def : Defs) + Def->CycleEvent(); + CyclesLeft--; + } + if (!CyclesLeft) + Stage = IS_EXECUTED; +} + +} // Namespace end. Index: tools/llvm-mca/LLVMBuild.txt =================================================================== --- tools/llvm-mca/LLVMBuild.txt +++ tools/llvm-mca/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./tools/llvm-mc/LLVMBuild.txt ----------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = llvm-mca +parent = Tools +required_libraries = MC MCParser Support all-targets Index: tools/llvm-mca/LSUnit.h =================================================================== --- tools/llvm-mca/LSUnit.h +++ tools/llvm-mca/LSUnit.h @@ -0,0 +1,162 @@ +//===------------------------- LSUnit.h --------------------------*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// A Load/Store unit class that models load/store queues and that implements +/// a simple weak memory consistency model. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_LSUNIT_H +#define LLVM_TOOLS_LLVM_MCA_LSUNIT_H + +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "llvm-mca" + +namespace mca { + +/// \brief A Load/Store Unit implementing a load and store queues. +/// +/// This class implements a load queue and a store queue to emulate the +/// out-of-order execution of memory operations. +/// Each load (or store) consumes an entry in the load (or store) queue. +/// +/// Rules are: +/// 1) A younger load is allowed to pass an older load only if there are no +/// stores nor barriers in between the two loads. +/// 2) An younger store is not allowed to pass an older store. +/// 3) A younger store is not allowed to pass an older load. +/// 4) A younger load is allowed to pass an older store only if the load does +/// not alias with the store. +/// +/// This class conservatively/pessimistically assumes that loads +/// may-alias store operations. Under this assumption, younger loads are never +/// allowed to pass older stores (this negatively affects rule 4.). +/// +/// In order to make it possible for a younger load to pass an older store, +/// flag `AssumeNoAlias` must be set by the constructor of LSUnit. +/// Essentially, this LSUnit doesn't attempt to run any sort alias analysis to +/// predict when loads and stores don't alias with eachother. +/// +/// In the case of write-combining memory, rule 2. could be relaxed to allow +/// reordering of non-aliasing store operations. In practice, this class +/// doesn't allow it. +/// To put it in another way, there is no option to specify a different memory +/// type for memory operations (example: write-through, write-combining, etc.). +/// Also, there is no way to weaken the memory model, and this unit doesn't +/// support write-combining behavior. +/// +/// No assumptions are made on the size of the store buffer. +/// As mentioned before, this class conservatively assumes a may-alias relation +/// between loads and stores. Consequently, LSUnit doesn't know how to identify +/// cases where store-to-load forwarding would occur. +/// +/// LSUnit doesn't attempt to predict whether a load or store hits or misses +/// the L1 cache. To be more specific, LSUnit doesn't know anything about +/// the cache hierarchy and memory types. +/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the +/// scheduling model provides an "optimistic" load-to-use latency (which usually +/// matches the load-to-use latency for when there is a hit in the L1D). +/// +/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor +/// memory-barrier like instructions. +/// LSUnit conservatively assumes that an instruction which `mayLoad` and has +/// `unmodeled side effects` behave like a "soft" load-barrier. That means, it +/// serializes loads without forcing a flush of the load queue. +/// Similarly, instructions that both `mayStore` and have `unmodeled side +/// effects` are treated like store barriers. A full memory +/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side +/// effects. This is obviously inaccurate, but this is the best that we can do +/// at the moment. +/// +/// Each load/store barrier consumes one entry in the load/store queue. A +/// load/store barrier enforces ordering of loads/stores: +/// - A younger load cannot pass a load barrier. +/// - A younger store cannot pass a store barrier. +/// +/// A younger load has to wait for the memory load barrier to execute. +/// A load/store barrier is "executed" when it becomes the oldest entry in +/// the load/store queue(s). That also means, all the older loads/stores have +/// already been executed. +class LSUnit { + // Load queue size. + // LQ_Size == 0 means that there are infinite slots in the load queue. + unsigned LQ_Size; + + // Store queue size. + // SQ_Size == 0 means that there are infinite slots in the store queue. + unsigned SQ_Size; + + // If true, loads will never alias with stores. + bool NoAlias; + + std::set LoadQueue; + std::set StoreQueue; + + void AssignLQSlot(unsigned Index); + void AssignSQSlot(unsigned Index); + bool IsReadyNoAlias(unsigned Index) const; + + // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is + // conservatively treated as a store barrier. It forces older store to be + // executed before newer stores are issued. + std::set StoreBarriers; + + // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is + // conservatively treated as a load barrier. It forces older loads to execute + // before newer loads are issued. + std::set LoadBarriers; + +public: + LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false) + : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {} + +#ifndef NDEBUG + void dump() const; +#endif + + bool IsSQEmpty() const { return StoreQueue.empty(); } + bool IsLQEmpty() const { return LoadQueue.empty(); } + bool IsSQFull() const { return SQ_Size != 0 && StoreQueue.size() == SQ_Size; } + bool IsLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; } + + void Reserve(unsigned Index, bool MayLoad, bool MayStore, bool IsMemBarrier) { + if (!MayLoad && !MayStore) + return; + if (MayLoad) { + if (IsMemBarrier) + LoadBarriers.insert(Index); + AssignLQSlot(Index); + } + if (MayStore) { + if (IsMemBarrier) + StoreBarriers.insert(Index); + AssignSQSlot(Index); + } + } + + // The rules are: + // 1. A store may not pass a previous store. + // 2. A load may not pass a previous store unless flag 'NoAlias' is set. + // 3. A load may pass a previous load. + // 4. A store may not pass a previous load (regardless of flag 'NoAlias'). + // 5. A load has to wait until an older load barrier is fully executed. + // 6. A store has to wait until an older store barrier is fully executed. + bool IsReady(unsigned Index) const; + void OnInstructionExecuted(unsigned Index); +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/LSUnit.cpp =================================================================== --- tools/llvm-mca/LSUnit.cpp +++ tools/llvm-mca/LSUnit.cpp @@ -0,0 +1,115 @@ +//===----------------------- LSUnit.cpp --------------------------*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// A Load-Store Unit for the llvm-mca tool. +/// +//===----------------------------------------------------------------------===// + +#include "LSUnit.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "llvm-mca" + +namespace mca { + +#ifndef NDEBUG +void LSUnit::dump() const { + dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n'; + dbgs() << "[LSUnit] SQ_Size = " << SQ_Size << '\n'; + dbgs() << "[LSUnit] NextLQSlotIdx = " << LoadQueue.size() << '\n'; + dbgs() << "[LSUnit] NextSQSlotIdx = " << StoreQueue.size() << '\n'; +} +#endif + +void LSUnit::AssignLQSlot(unsigned Index) { + assert(!IsLQFull()); + assert(LoadQueue.count(Index) == 0); + + DEBUG(dbgs() << "[LSUnit] - AssignLQSlot \n"); + LoadQueue.insert(Index); +} + +void LSUnit::AssignSQSlot(unsigned Index) { + assert(!IsSQFull()); + assert(StoreQueue.count(Index) == 0); + + DEBUG(dbgs() << "[LSUnit] - AssignSQSlot \n"); + StoreQueue.insert(Index); +} + +bool LSUnit::IsReady(unsigned Index) const { + bool IsALoad = LoadQueue.count(Index) != 0; + bool IsAStore = StoreQueue.count(Index) != 0; + unsigned LoadBarrierIndex = LoadBarriers.empty() ? 0 : *LoadBarriers.begin(); + unsigned StoreBarrierIndex = StoreBarriers.empty() ? 0 : *StoreBarriers.begin(); + + if (IsALoad && LoadBarrierIndex) { + if (Index > LoadBarrierIndex) + return false; + if (Index == LoadBarrierIndex && Index != *LoadQueue.begin()) + return false; + } + + if (IsAStore && StoreBarrierIndex) { + if (Index > StoreBarrierIndex) + return false; + if (Index == StoreBarrierIndex && Index != *StoreQueue.begin()) + return false; + } + + if (NoAlias && IsALoad) + return true; + + if (StoreQueue.size()) { + // Check if this memory operation is younger than the older store. + if (Index > *StoreQueue.begin()) + return false; + } + + // Okay, we are older than the oldest store in the queue. + // If there are no pending loads, then we can say for sure that this + // instruction is ready. + if (IsLQEmpty()) + return true; + + // Check if there are no older loads. + if (Index <= *LoadQueue.begin()) + return true; + + // There is at least one younger load. + return !IsAStore; +} + +void LSUnit::OnInstructionExecuted(unsigned Index) { + std::set::iterator it = LoadQueue.find(Index); + if (it != LoadQueue.end()) { + DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index + << " has been removed from the load queue.\n"); + LoadQueue.erase(it); + } + + it = StoreQueue.find(Index); + if (it != StoreQueue.end()) { + DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index + << " has been removed from the store queue.\n"); + StoreQueue.erase(it); + } + + if (!StoreBarriers.empty() && Index == *StoreBarriers.begin()) + StoreBarriers.erase(StoreBarriers.begin()); + if (!LoadBarriers.empty() && Index == *LoadBarriers.begin()) + LoadBarriers.erase(LoadBarriers.begin()); +} +} // namespace mca Index: tools/llvm-mca/ResourcePressureView.h =================================================================== --- tools/llvm-mca/ResourcePressureView.h +++ tools/llvm-mca/ResourcePressureView.h @@ -0,0 +1,115 @@ +//===--------------------- ResourcePressureView.h ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file define class ResourcePressureView. +/// Class ResourcePressureView observes hardware events generated by +/// the Backend object and collects statistics related to resource usage at +/// instruction granularity. +/// Resource pressure information is then printed out to a stream in the +/// form of a table like the one from the example below: +/// +/// Resources: +/// [0] - JALU0 +/// [1] - JALU1 +/// [2] - JDiv +/// [3] - JFPM +/// [4] - JFPU0 +/// [5] - JFPU1 +/// [6] - JLAGU +/// [7] - JSAGU +/// [8] - JSTC +/// [9] - JVIMUL +/// +/// Resource pressure per iteration: +/// [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +/// 0.00 0.00 0.00 0.00 2.00 2.00 0.00 0.00 0.00 0.00 +/// +/// Resource pressure by instruction: +/// [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +/// - - - - - 1.00 - - - - vpermilpd $1, %xmm0, +/// %xmm1 +/// - - - - 1.00 - - - - - vaddps %xmm0, %xmm1, +/// %xmm2 +/// - - - - - 1.00 - - - - vmovshdup %xmm2, %xmm3 +/// - - - - 1.00 - - - - - vaddss %xmm2, %xmm3, +/// %xmm4 +/// +/// In this example, we have AVX code executed on AMD Jaguar (btver2). +/// Both shuffles and vector floating point add operations on XMM registers have +/// a reciprocal throughput of 1cy. +/// Each add is issued to pipeline JFPU0, while each shuffle is issued to +/// pipeline JFPU1. The overall pressure per iteration is reported by two +/// tables: the first smaller table is the resource pressure per iteration; +/// the second table reports resource pressure per instruction. Values are the +/// average resource cycles consumed by an instruction. +/// Every vector add from the example uses resource JFPU0 for an average of 1cy +/// per iteration. Consequently, the resource pressure on JFPU0 is of 2cy per +/// iteration. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H +#define LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H + +#include "HWEventListener.h" +#include "SourceMgr.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include + +namespace mca { + +using namespace llvm; + +class Backend; + +/// This class collects resource pressure statistics and it is able to print +/// out all the collected information as a table to an output stream. +class ResourcePressureView : public HWEventListener { + const MCSubtargetInfo &STI; + MCInstPrinter &MCIP; + const SourceMgr &Source; + + // Map to quickly get a resource descriptor from a mask. + std::map Resource2VecIndex; + + // Table of resources used by instructions. + std::vector ResourceUsage; + unsigned NumResourceUnits; + + const MCInst &GetMCInstFromIndex(unsigned Index) const; + void printResourcePressurePerIteration(raw_ostream &OS, + unsigned Executions) const; + void printResourcePressurePerInstruction(raw_ostream &OS, + unsigned Executions) const; + void initialize(const ArrayRef ProcResoureMasks); + +public: + ResourcePressureView(const MCSubtargetInfo &ST, MCInstPrinter &Printer, + const SourceMgr &SM, + const ArrayRef ProcResourceMasks) + : STI(ST), MCIP(Printer), Source(SM) { + initialize(ProcResourceMasks); + } + + void OnInstructionIssued( + unsigned Index, + const ArrayRef> &Used) override; + + void printResourcePressure(raw_ostream &OS, unsigned Cycles) const { + unsigned Executions = Source.GetNumIterations(); + printResourcePressurePerIteration(OS, Executions); + printResourcePressurePerInstruction(OS, Executions); + } +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/ResourcePressureView.cpp =================================================================== --- tools/llvm-mca/ResourcePressureView.cpp +++ tools/llvm-mca/ResourcePressureView.cpp @@ -0,0 +1,174 @@ +//===--------------------- ResourcePressureView.cpp -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements methods in the ResourcePressureView interface. +/// +//===----------------------------------------------------------------------===// + +#include "ResourcePressureView.h" +#include "llvm/Support/raw_ostream.h" + +namespace mca { + +void ResourcePressureView::initialize( + const ArrayRef ProcResourceMasks) { + // Populate the map of resource descriptors. + unsigned R2VIndex = 0; + const MCSchedModel &SM = STI.getSchedModel(); + for (unsigned i = 0; i < SM.getNumProcResourceKinds(); ++i) { + const MCProcResourceDesc &ProcResource = *SM.getProcResource(i); + unsigned NumUnits = ProcResource.NumUnits; + // Skip groups and invalid resources with zero units. + if (ProcResource.SubUnitsIdxBegin || !NumUnits) + continue; + + uint64_t ResourceMask = ProcResourceMasks[i]; + Resource2VecIndex.insert( + std::pair(ResourceMask, R2VIndex)); + R2VIndex += ProcResource.NumUnits; + } + + NumResourceUnits = R2VIndex; + ResourceUsage.resize(NumResourceUnits * (Source.size() + 1)); + std::fill(ResourceUsage.begin(), ResourceUsage.end(), 0); +} + +void ResourcePressureView::OnInstructionIssued( + unsigned Index, const ArrayRef> &Used) { + unsigned SourceIdx = Index % Source.size(); + for (const auto &Use : Used) { + const ResourceRef &RR = Use.first; + assert(Resource2VecIndex.find(RR.first) != Resource2VecIndex.end()); + unsigned R2VIndex = Resource2VecIndex[RR.first]; + R2VIndex += countTrailingZeros(RR.second); + ResourceUsage[R2VIndex + NumResourceUnits * SourceIdx] += Use.second; + ResourceUsage[R2VIndex + NumResourceUnits * Source.size()] += Use.second; + } +} + +static void printColumnNames(raw_string_ostream &OS, const MCSchedModel &SM) { + for (unsigned i = 1, ResourceIndex = 0; i < SM.getNumProcResourceKinds(); + ++i) { + const MCProcResourceDesc &ProcResource = *SM.getProcResource(i); + unsigned NumUnits = ProcResource.NumUnits; + // Skip groups and invalid resources with zero units. + if (ProcResource.SubUnitsIdxBegin || !NumUnits) + continue; + + if (NumUnits == 1) { + OS << '[' << ResourceIndex << ']'; + if (ResourceIndex < 10) + OS << " "; + else + OS << " "; + ResourceIndex++; + continue; + } + + for (unsigned j = 0; j < NumUnits; ++j) { + OS << "[" << ResourceIndex << '.' << j << ']'; + if (ResourceIndex < 10) + OS << " "; + else + OS << ' '; + } + ResourceIndex++; + } +} + +void ResourcePressureView::printResourcePressurePerIteration( + raw_ostream &OS, unsigned Executions) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + + TempStream << "\n\nResources:\n"; + const MCSchedModel &SM = STI.getSchedModel(); + for (unsigned i = 0, ResourceIndex = 0; i < SM.getNumProcResourceKinds(); + ++i) { + const MCProcResourceDesc &ProcResource = *SM.getProcResource(i); + unsigned NumUnits = ProcResource.NumUnits; + // Skip groups and invalid resources with zero units. + if (ProcResource.SubUnitsIdxBegin || !NumUnits) + continue; + + for (unsigned j = 0; j < NumUnits; ++j) { + TempStream << '[' << ResourceIndex; + if (NumUnits > 1) + TempStream << '.' << j; + TempStream << "] - " << ProcResource.Name << '\n'; + } + + ResourceIndex++; + } + + TempStream << "\n\nResource pressure per iteration:\n"; + printColumnNames(TempStream, SM); + TempStream << '\n'; + + for (unsigned j = 0; j < NumResourceUnits; ++j) { + unsigned Usage = ResourceUsage[j + Source.size() * NumResourceUnits]; + if (!Usage) { + TempStream << " - "; + continue; + } + + double Pressure = (double)Usage / Executions; + TempStream << format("%.2f", Pressure); + if (Pressure < 10.0) + TempStream << " "; + else if (Pressure < 100.0) + TempStream << " "; + else + TempStream << ' '; + } + + TempStream.flush(); + OS << Buffer; +} + +void ResourcePressureView::printResourcePressurePerInstruction( + raw_ostream &OS, unsigned Executions) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + + TempStream << "\n\nResource pressure by instruction:\n"; + printColumnNames(TempStream, STI.getSchedModel()); + TempStream << "\tInstructions:\n"; + + for (unsigned i = 0; i < Source.size(); ++i) { + for (unsigned j = 0; j < NumResourceUnits; ++j) { + unsigned Usage = ResourceUsage[j + i * NumResourceUnits]; + if (Usage == 0) { + TempStream << " - "; + } else { + double Pressure = (double)Usage / Executions; + if (Pressure < 0.005) { + TempStream << " - "; + } else { + TempStream << format("%.2f", Pressure); + if (Pressure < 10.0) + TempStream << " "; + else if (Pressure < 100.0) + TempStream << " "; + else + TempStream << ' '; + } + } + } + + MCIP.printInst(&Source.GetMCInstFromIndex(i), TempStream, "", STI); + TempStream << '\n'; + TempStream.flush(); + OS << Buffer; + Buffer = ""; + } +} + +} // namespace mca Index: tools/llvm-mca/Scheduler.h =================================================================== --- tools/llvm-mca/Scheduler.h +++ tools/llvm-mca/Scheduler.h @@ -0,0 +1,522 @@ +//===--------------------- Scheduler.h ------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// A scheduler for Processor Resource Units and Processor Resource Groups. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULER_H +#define LLVM_TOOLS_LLVM_MCA_SCHEDULER_H + +#include "Instruction.h" +#include "LSUnit.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include + +namespace mca { + +using namespace llvm; + +class Backend; + +enum ResourceStateEvent { + RS_BUFFER_AVAILABLE, + RS_BUFFER_UNAVAILABLE, + RS_RESERVED +}; + +/// \brief A descriptor for processor resources. +/// +/// Each object of class ResourceState is associated to a specific processor +/// resource. There is an instance of this class for every processor resource +/// defined by the scheduling model. +/// A ResourceState dynamically tracks the availability of units of a processor +/// resource. For example, the ResourceState of a ProcResGroup tracks the +/// availability of resource units which are part of the group. +/// +/// Internally, ResourceState uses a round-robin selector to identify +/// which unit of the group shall be used next. +class ResourceState { + // A resource identifier. + // If this ResourceState describes a group, then the number of bits + // set in this mask is equivalent to the number of units of the group. + // A ResourceMask can be used to quickly check if a unit is part of a group + // or not. + // + // Field ResourceMask has exactly one bit set if this object describes a + // simple ProcResource. For ProcResource only, this field can be used as a + // unique identifer. + // Field ResourceMask for resource groups is obtained by OR'ing + // the value of ResourceMask of every ProcResourceUnits in the group. + uint64_t ResourceMask; + + // A ProcResource can specify a number of units. A ProcResource with more + // than one unit behaves like a group. + // This field has one bit set for each resource in the group. + // Note that in this file, we often call sub-resource a resource which is + // part of a group. + // For ProcResGroup, this field defaults to 'ResourceMask'. For non-group + // resources, the number of bits set in this mask is equivalent to the + // number of units (i.e. field 'NumUnits' in 'ProcResourceUnits'). + uint64_t ResourceSizeMask; + + // A simple round-robin selector for processor resources. + // Each bit of the mask identifies a sub resource within this group. + // + // As an example, lets assume that this ResourceState describes a + // processor resource group composed of the following three units: + // ResourceA -- 0b001 + // ResourceB -- 0b010 + // ResourceC -- 0b100 + // + // Each unit is identified by a ResourceMask which always contains a + // single bit set. Field NextInSequenceMask is initially set to value + // 0xb111. That value is obtained by OR'ing the resource masks of + // processor resource that are part of the group. + // + // NextInSequenceMask -- 0b111 + // + // Field NextInSequenceMask is used by the resource manager (i.e. + // an object of class ResourceManager) to select the "next available resource" + // from the set. The algorithm would prioritize resources with a bigger + // ResourceMask value. + // + // In this example, there are three resources in the set, and 'ResourceC' + // has the highest mask value. The round-robin selector would firstly select + // 'ResourceC', then 'ResourceB', and eventually 'ResourceA'. + // + // When a resource R is used, its corresponding bit is cleared from the set. + // + // Back to the example: + // If 'ResourceC' is selected, then the new value of NextInSequenceMask + // becomes 0xb011. + // + // When NextInSequenceMask becomes zero, it is reset to its original value + // (in this example, that value would be 0b111). + uint64_t NextInSequenceMask; + + // Some instructions can only be issued on very specific pipeline resources. + // For those instructions, we know exactly which resource would be consumed + // without having to dynamically select it using field 'NextInSequenceMask'. + // + // The resource mask bit associated to the (statically) selected + // processor resource is still cleared from the 'NextInSequenceMask'. + // If that bit was already zero in NextInSequenceMask, then we update + // mask 'RemovedFromNextInSequence'. + // + // When NextInSequenceMask is reset back to its initial value, the algorithm + // removes any bits which are set in RemoveFromNextInSequence. + uint64_t RemovedFromNextInSequence; + + // A mask of ready units. + uint64_t ReadyMask; + + // Buffered resources will have this field set to a positive number bigger + // than 0. A buffered resource behaves like a separate reservation station + // implementing its own buffer for out-of-order execution. + // A buffer of 1 is for units that force in-order execution. + // A value of 0 is treated specially. In particular, a resource with + // A BufferSize = 0 is for an in-order issue/dispatch resource. + // That means, this resource is reserved starting from the dispatch event, + // until all the "resource cycles" are consumed after the issue event. + // While this resource is reserved, no other instruction may be dispatched. + int BufferSize; + + // Available slots in the buffer (zero, if this is not a buffered resource). + unsigned AvailableSlots; + + // Maximum number of buffer slots seen used during one cycle. + // This helps tracking dynamic dispatch stalls caused by the lack of + // entries in the scheduler's queue. + unsigned MaxUsedSlots; + + // True if this is resource is currently unavailable. + // An instruction may "reserve" a resource for a number of cycles. + // During those cycles, the reserved resource cannot be used for other + // instructions, even if the ReadyMask is set. + bool Unavailable; + + bool IsSubResourceReady(uint64_t ID) const { return ReadyMask & ID; } + + /// Returns the mask identifier of the next available resource in the set. + uint64_t GetNextInSequence() const { + assert(NextInSequenceMask); + return PowerOf2Floor(NextInSequenceMask); + } + + /// Returns the mask of the next available resource within the set, + /// and updates the resource selector. + void UpdateNextInSequence() { + NextInSequenceMask ^= GetNextInSequence(); + if (!NextInSequenceMask) + NextInSequenceMask = ResourceSizeMask; + } + + uint64_t ComputeResourceSizeMaskForGroup(uint64_t ResourceMask) { + assert(countPopulation(ResourceMask) > 1); + return ResourceMask ^ PowerOf2Floor(ResourceMask); + } + +public: + ResourceState(const MCProcResourceDesc &Desc, uint64_t Mask) + : ResourceMask(Mask) { + bool IsAGroup = countPopulation(ResourceMask) > 1; + ResourceSizeMask = IsAGroup ? ComputeResourceSizeMaskForGroup(ResourceMask) + : ((1ULL << Desc.NumUnits) - 1); + NextInSequenceMask = ResourceSizeMask; + RemovedFromNextInSequence = 0; + ReadyMask = ResourceSizeMask; + BufferSize = Desc.BufferSize; + AvailableSlots = BufferSize == -1 ? 0U : static_cast(BufferSize); + MaxUsedSlots = 0; + Unavailable = false; + } + + uint64_t GetResourceMask() const { return ResourceMask; } + int GetBufferSize() const { return BufferSize; } + unsigned GetMaxUsedSlots() const { return MaxUsedSlots; } + + bool IsBuffered() const { return BufferSize > 0; } + bool IsInOrder() const { return BufferSize == 1; } + bool IsADispatchHazard() const { return BufferSize == 0; } + bool IsReserved() const { return Unavailable; } + + void SetReserved() { Unavailable = true; } + void ClearReserved() { Unavailable = false; } + + // A resource is ready if it is not reserved, and if there are enough + // available units. + // If a resource is also a dispatch hazard, then we don't check if + // it is reserved because that check would always return true. + // A resource marked as "dispatch hazard" is always reserved at + // dispatch time. When this method is called, the assumption is that + // the user of this resource has been already dispatched. + bool IsReady(unsigned NumUnits = 1) const { + return (!IsReserved() || IsADispatchHazard()) && + countPopulation(ReadyMask) >= NumUnits; + } + bool IsAResourceGroup() const { return countPopulation(ResourceMask) > 1; } + + bool ContainsResource(uint64_t ID) const { return ResourceMask & ID; } + + void MarkSubResourceAsUsed(uint64_t ID) { + assert(IsSubResourceReady(ID)); + ReadyMask ^= ID; + } + + void ReleaseSubResource(uint64_t ID) { + assert(!IsSubResourceReady(ID)); + ReadyMask ^= ID; + } + + unsigned GetNumUnits() const { + return IsAResourceGroup() ? 1U : countPopulation(ResourceSizeMask); + } + + uint64_t SelectNextInSequence(); + void RemoveFromNextInSequence(uint64_t ID); + + ResourceStateEvent IsBufferAvailable() const { + if (IsADispatchHazard() && IsReserved()) + return RS_RESERVED; + if (!IsBuffered() || AvailableSlots) + return RS_BUFFER_AVAILABLE; + return RS_BUFFER_UNAVAILABLE; + } + + void ReserveBuffer() { + if (AvailableSlots) + AvailableSlots--; + unsigned UsedSlots = static_cast(BufferSize) - AvailableSlots; + MaxUsedSlots = std::max(MaxUsedSlots, UsedSlots); + } + + void ReleaseBuffer() { + if (BufferSize > 0) + AvailableSlots++; + assert(AvailableSlots <= static_cast(BufferSize)); + } + +#ifndef NDEBUG + void dump() const; +#endif +}; + +/// \brief A resource unit identifier. +/// +/// This is used to identify a specific processor resource unit using a pair +/// of indices where the 'first' index is a processor resource mask, and the +/// 'second' index is an index for a "sub-resource" (i.e. unit). +typedef std::pair ResourceRef; + +// First: a resource mask identifying a buffered resource. +// Second: max number of buffer entries used in this resource. +typedef std::pair BufferUsageEntry; + +/// A resource manager for processor resource units and groups. +/// +/// This class owns all the ResourceState objects, and it is responsible for +/// acting on requests from a Scheduler by updating the internal state of +/// ResourceState objects. +/// This class doesn't know about instruction itineraries and functional units. +/// In future, it can be extended to support itineraries too through the same +/// public interface. +class ResourceManager { + // The resource manager owns all the ResourceState. + SmallDenseMap> Resources; + + // Keeps track of which resources are busy, and how many cycles are left + // before those become usable again. + SmallDenseMap BusyResources; + + // A table to map processor resource IDs to processor resource masks. + SmallVector ProcResID2Mask; + + // Adds a new resource state in Resources, as well as a new descriptor in + // ResourceDescriptor. + void AddResource(const MCProcResourceDesc &Desc, uint64_t Mask); + + // Compute processor resource masks for each processor resource declared by + // the scheduling model. + void ComputeProcResourceMasks(const MCSchedModel &SM); + + // Populate resource descriptors. + void initialize(const MCSchedModel &SM) { + ComputeProcResourceMasks(SM); + for (unsigned i = 0; i < SM.getNumProcResourceKinds(); ++i) + AddResource(*SM.getProcResource(i), ProcResID2Mask[i]); + } + + // Returns the actual resource unit that will be used. + ResourceRef SelectPipe(uint64_t ResourceID); + + void Use(ResourceRef RR); + void Release(ResourceRef RR); + + unsigned GetNumUnits(uint64_t ResourceID) const { + assert(Resources.find(ResourceID) != Resources.end()); + return Resources.find(ResourceID)->getSecond()->GetNumUnits(); + } + + // Reserve a specific Resource kind. + void ReserveBuffer(uint64_t ResourceID) { + assert(IsBufferAvailable(ResourceID) == + ResourceStateEvent::RS_BUFFER_AVAILABLE); + ResourceState &Resource = *Resources[ResourceID]; + Resource.ReserveBuffer(); + } + + void ReleaseBuffer(uint64_t ResourceID) { + Resources[ResourceID]->ReleaseBuffer(); + } + + ResourceStateEvent IsBufferAvailable(uint64_t ResourceID) const { + const ResourceState &Resource = *Resources.find(ResourceID)->second; + return Resource.IsBufferAvailable(); + } + + bool IsReady(uint64_t ResourceID, unsigned NumUnits) const { + const ResourceState &Resource = *Resources.find(ResourceID)->second; + return Resource.IsReady(NumUnits); + } + +public: + ResourceManager(const MCSchedModel &SM) { initialize(SM); } + + ResourceStateEvent CanBeDispatched(const InstrDesc &Desc) const { + ResourceStateEvent Result = ResourceStateEvent::RS_BUFFER_AVAILABLE; + for (uint64_t Buffer : Desc.Buffers) { + Result = IsBufferAvailable(Buffer); + if (Result != ResourceStateEvent::RS_BUFFER_AVAILABLE) + break; + } + + return Result; + } + + void ReserveBuffers(const InstrDesc &Desc) { + for (const auto &R : Desc.Buffers) + ReserveBuffer(R); + } + + void ReleaseBuffers(const InstrDesc &Desc) { + for (const auto &R : Desc.Buffers) + ReleaseBuffer(R); + } + + void ReserveResource(uint64_t ResourceID) { + ResourceState &Resource = *Resources[ResourceID]; + assert(!Resource.IsReserved()); + Resource.SetReserved(); + } + + void ReleaseResource(uint64_t ResourceID) { + ResourceState &Resource = *Resources[ResourceID]; + Resource.ClearReserved(); + } + + void ReserveDispatchHazardResources(const InstrDesc &Desc); + + // Returns true if all resources are in-order, and there is at least one + // resource which is a dispatch hazard (BufferSize = 0). + bool MustIssueImmediately(const InstrDesc &Desc); + + bool CanBeIssued(const InstrDesc &Desc) const; + double GetRThroughput(const InstrDesc &Desc) const; + + void + IssueInstruction(unsigned Index, const InstrDesc &Desc, + SmallVector, 4> &Pipes); + + void CycleEvent(SmallVector &ResourcesFreed); + + void getBuffersUsage(std::vector &Usage) const { + for (const auto &Resource : Resources) { + const ResourceState &RS = *Resource.second; + if (RS.IsBuffered()) + Usage.emplace_back(std::pair(RS.GetResourceMask(), + RS.GetMaxUsedSlots())); + } + } + + const ArrayRef GetProcResMasks() const { return ProcResID2Mask; } + +#ifndef NDEBUG + void dump() const { + for (const auto &Resource : Resources) + Resource.second->dump(); + } +#endif +}; // namespace mca + +/// Class Scheduler is responsible for issuing instructions to pipeline +/// resources. Internally, it delegates to a ResourceManager the management of +/// processor resources. +/// This class is also responsible for tracking the progress of instructions +/// from the dispatch stage, until the write-back stage. +/// +/// An nstruction dispatched to the Scheduler is initially placed into either +/// the 'WaitQueue' or the 'ReadyQueue' depending on the availability of the input +/// operands. +/// Instructions in the WaitQueue are ordered by instruction index. +/// An instruction is moved from the WaitQueue to the ReadyQueue when register +/// operands become available, and all memory dependencies are met. +/// Instructions that are moved from the WaitQueue to the ReadyQueue transition +/// from state 'IS_AVAILABLE' to state 'IS_READY'. +/// +/// At the beginning of each cycle, the Scheduler checks if there are +/// instructions in the WaitQueue that can be moved to the ReadyQueue. If the +/// ReadyQueue is not empty, then older instructions from the queue are issued to +/// the processor pipelines, and the underlying ResourceManager is updated +/// accordingly. The ReadyQueue is ordered by instruction index to guarantee +/// that the first instructions in the set are also the oldest. +/// +/// An Instruction is moved from the ReadyQueue the `IssuedQueue` when it is +/// issued to a (one or more) pipeline(s). This event also causes an instruction +/// state transition (i.e. from state IS_READY, to state IS_EXECUTING). +/// An Instruction leaves the IssuedQueue when it reaches the write-back stage. +class Scheduler { + const MCSchedModel &SM; + + // Hardware resources that are managed by this scheduler. + std::unique_ptr Resources; + std::unique_ptr LSU; + + // The Backend gets notified when instructions are ready/issued/executed. + Backend *Owner; + + std::map WaitQueue; + std::map ReadyQueue; + std::map IssuedQueue; + + void NotifyInstructionIssued( + unsigned Index, const ArrayRef> &Used); + void NotifyInstructionExecuted(unsigned Index); + void NotifyInstructionReady(unsigned Index); + void NotifyResourceAvailable(const ResourceRef &RR); + + /// Issue instructions from the ready queue by giving priority to older + /// instructions. + void Issue(); + + /// Issue an instruction without updating the ready queue. + void IssueInstruction(Instruction *IS, unsigned InstrIndex); + void UpdatePendingQueue(); + void UpdateIssuedQueue(); + +public: + Scheduler(Backend *B, const MCSchedModel &Model, unsigned LoadQueueSize, + unsigned StoreQueueSize, bool AssumeNoAlias) + : SM(Model), Resources(llvm::make_unique(SM)), + LSU(llvm::make_unique(LoadQueueSize, StoreQueueSize, + AssumeNoAlias)), + Owner(B) {} + + /// Scheduling events. + /// + /// The DispatchUnit is responsible for querying the Scheduler before + /// dispatching new instructions. Queries are performed through method + /// `Scheduler::CanBeDispatched`, which returns an instance of this enum to + /// tell if the dispatch would fail or not. If scheduling resources are + /// available, and the instruction can be dispatched, then the query returns + /// HWS_AVAILABLE. A values different than HWS_AVAILABLE means that the + /// instruction cannot be dispatched during this cycle. + /// + /// Each event name starts with prefix "HWS_", and it is followed by + /// a substring which describes the reason why the Scheduler was unavailable + /// (or "AVAILABLE" if the instruction is allowed to be dispatched). + /// + /// HWS_QUEUE_UNAVAILABLE is returned if there are not enough available slots + /// in the scheduler's queue. That means, one (or more) buffered resources + /// consumed by the instruction were full. + /// + /// HWS_LD_QUEUE_UNAVAILABLE is returned when the instruction 'mayLoad', and + /// the load queue in the load/store unit (implemented by class LSUnit) is + /// full. Similarly, HWS_ST_QUEUE_UNAVAILABLE is returned when the store queue + /// is full, and the instruction to be dispatched 'mayStore'. + /// + /// HWS_DISPATCH_GROUP_RESTRICTION is only returned in special cases where the + /// instruction consumes an in-order issue/dispatch resource (i.e. a resource + /// with `BufferSize=0`), and the resource is not ready at dispatch + /// stage. + enum Event { + HWS_AVAILABLE, + HWS_QUEUE_UNAVAILABLE, + HWS_DISPATCH_GROUP_RESTRICTION, + HWS_LD_QUEUE_UNAVAILABLE, + HWS_ST_QUEUE_UNAVAILABLE + }; + + Event CanBeDispatched(const InstrDesc &Desc) const; + Instruction *ScheduleInstruction(unsigned Idx, Instruction *MCIS); + + double GetRThroughput(const InstrDesc &Desc) const { + return Resources->GetRThroughput(Desc); + } + + void CycleEvent(unsigned Cycle); + + void GetBuffersUsage(std::vector &Usage) const { + Resources->getBuffersUsage(Usage); + } + + const ArrayRef GetProcResourceMasks() const { + return Resources->GetProcResMasks(); + } + +#ifndef NDEBUG + void dump() const; +#endif +}; + +} // Namespace mca + +#endif Index: tools/llvm-mca/Scheduler.cpp =================================================================== --- tools/llvm-mca/Scheduler.cpp +++ tools/llvm-mca/Scheduler.cpp @@ -0,0 +1,454 @@ +//===--------------------- Scheduler.cpp ------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// A scheduler for processor resource units and processor resource groups. +// +//===----------------------------------------------------------------------===// + +#include "Scheduler.h" +#include "Backend.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "llvm-mca" + +namespace mca { + +uint64_t ResourceState::SelectNextInSequence() { + assert(IsReady()); + uint64_t Next = GetNextInSequence(); + while (!IsSubResourceReady(Next)) { + UpdateNextInSequence(); + Next = GetNextInSequence(); + } + return Next; +} + +#ifndef NDEBUG +void ResourceState::dump() const { + dbgs() << "MASK: " << ResourceMask << ", SIZE_MASK: " << ResourceSizeMask + << ", NEXT: " << NextInSequenceMask << ", RDYMASK: " << ReadyMask + << ", BufferSize=" << BufferSize + << ", AvailableSlots=" << AvailableSlots + << ", Reserved=" << Unavailable << '\n'; +} +#endif + +// Adds a new resource state in Resources, as well as a new descriptor in +// ResourceDescriptor. Map 'Resources' allows to quickly obtain ResourceState +// objects from resource mask identifiers. +void ResourceManager::AddResource(const MCProcResourceDesc &Desc, + uint64_t Mask) { + assert(Resources.find(Mask) == Resources.end() && "Resource already added!"); + Resources[Mask] = llvm::make_unique(Desc, Mask); +} + +// Populate vector ProcResID2Mask with resource masks. One per each processor +// resource declared by the scheduling model. +void ResourceManager::ComputeProcResourceMasks(const MCSchedModel &SM) { + unsigned ProcResourceID = 0; + + // Create a unique bitmask for every processor resource unit. + // Skip resource at index 0, since it always references 'InvalidUnit'. + ProcResID2Mask.resize(SM.getNumProcResourceKinds()); + for (unsigned i = 1; i < SM.getNumProcResourceKinds(); ++i) { + const MCProcResourceDesc &Desc = *SM.getProcResource(i); + if (Desc.SubUnitsIdxBegin) + continue; + ProcResID2Mask[i] = 1ULL << ProcResourceID; + ProcResourceID++; + } + + // Create a unique bitmask for every processor resource group. + for (unsigned i = 1; i < SM.getNumProcResourceKinds(); ++i) { + const MCProcResourceDesc &Desc = *SM.getProcResource(i); + if (!Desc.SubUnitsIdxBegin) + continue; + ProcResID2Mask[i] |= 1ULL << ProcResourceID; + for (unsigned U = 0; U < Desc.NumUnits; ++U) { + uint64_t OtherMask = ProcResID2Mask[Desc.SubUnitsIdxBegin[U]]; + ProcResID2Mask[i] |= OtherMask; + } + ProcResourceID++; + } +} + +// Returns the actual resource consumed by this Use. +// First, is the primary resource ID. +// Second, is the specific sub-resource ID. +std::pair ResourceManager::SelectPipe(uint64_t ResourceID) { + ResourceState &RS = *Resources[ResourceID]; + uint64_t SubResourceID = RS.SelectNextInSequence(); + if (RS.IsAResourceGroup()) + return SelectPipe(SubResourceID); + return std::pair(ResourceID, SubResourceID); +} + +void ResourceState::RemoveFromNextInSequence(uint64_t ID) { + assert(NextInSequenceMask); + assert(countPopulation(ID) == 1); + if (ID > GetNextInSequence()) + RemovedFromNextInSequence |= ID; + NextInSequenceMask = NextInSequenceMask & (~ID); + if (!NextInSequenceMask) { + NextInSequenceMask = ResourceSizeMask; + assert(NextInSequenceMask != RemovedFromNextInSequence); + NextInSequenceMask ^= RemovedFromNextInSequence; + RemovedFromNextInSequence = 0; + } +} + +void ResourceManager::Use(ResourceRef RR) { + // Mark the sub-resource referenced by RR as used. + ResourceState &RS = *Resources[RR.first]; + RS.MarkSubResourceAsUsed(RR.second); + // If there are still available units in RR.first, + // then we are done. + if (RS.IsReady()) + return; + + // Notify to other resources that RR.first is no longer available. + for (auto &Res : Resources) { + ResourceState &Current = *Res.second.get(); + if (!Current.IsAResourceGroup() || Current.GetResourceMask() == RR.first) + continue; + + if (Current.ContainsResource(RR.first)) { + Current.MarkSubResourceAsUsed(RR.first); + Current.RemoveFromNextInSequence(RR.first); + } + } +} + +void ResourceManager::Release(ResourceRef RR) { + ResourceState &RS = *Resources[RR.first]; + bool WasFullyUsed = !RS.IsReady(); + RS.ReleaseSubResource(RR.second); + if (!WasFullyUsed) + return; + + for (auto &Res : Resources) { + ResourceState &Current = *Res.second.get(); + if (!Current.IsAResourceGroup() || Current.GetResourceMask() == RR.first) + continue; + + if (Current.ContainsResource(RR.first)) + Current.ReleaseSubResource(RR.first); + } +} + +void ResourceManager::ReserveDispatchHazardResources(const InstrDesc &Desc) { + for (const auto &R : Desc.Buffers) { + ResourceState &Resource = *Resources[R]; + if (Resource.IsADispatchHazard()) { + assert(!Resource.IsReserved()); + Resource.SetReserved(); + } + } +} + +bool ResourceManager::CanBeIssued(const InstrDesc &Desc) const { + return std::all_of(Desc.Resources.begin(), Desc.Resources.end(), + [&](const std::pair &E) { + unsigned NumUnits = + E.second.IsReserved() ? 0U : E.second.NumUnits; + return IsReady(E.first, NumUnits); + }); +} + +// Returns true if all resources are in-order, and there is at least one +// resource which is a dispatch hazard (BufferSize = 0). +bool ResourceManager::MustIssueImmediately(const InstrDesc &Desc) { + if (!CanBeIssued(Desc)) + return false; + bool AllInOrderResources = + std::all_of(Desc.Buffers.begin(), Desc.Buffers.end(), + [&](const unsigned BufferMask) { + const ResourceState &Resource = *Resources[BufferMask]; + return Resource.IsInOrder() || Resource.IsADispatchHazard(); + }); + if (!AllInOrderResources) + return false; + + return std::any_of(Desc.Buffers.begin(), Desc.Buffers.end(), + [&](const unsigned BufferMask) { + return Resources[BufferMask]->IsADispatchHazard(); + }); +} + +double ResourceManager::GetRThroughput(const InstrDesc &ID) const { + double RThroughput = 0; + for (const auto &Usage : ID.Resources) { + const CycleSegment &CS = Usage.second.CS; + assert(CS.begin() == 0); + + if (Usage.second.IsReserved()) { + RThroughput = std::max(RThroughput, (double)CS.size()); + continue; + } + + unsigned Population = std::max(1U, countPopulation(Usage.first) - 1); + unsigned NumUnits = Population * GetNumUnits(Usage.first); + NumUnits -= Usage.second.NumUnits - 1; + unsigned Cycles = CS.size(); + RThroughput = std::max(RThroughput, (double)Cycles / NumUnits); + } + return RThroughput; +} + +void ResourceManager::IssueInstruction( + unsigned Index, const InstrDesc &Desc, + SmallVector, 4> &Pipes) { + ReleaseBuffers(Desc); + for (const auto &R : Desc.Resources) { + const CycleSegment &CS = R.second.CS; + if (!CS.size()) { + ReleaseResource(R.first); + continue; + } + + assert(CS.begin() == 0 && "Invalid {Start, End} cycles!"); + if (!R.second.IsReserved()) { + auto Pipe = SelectPipe(R.first); + Use(Pipe); + BusyResources[Pipe] += CS.size(); + Pipes.emplace_back(std::pair(Pipe, CS.size())); + } else { + // Mark this group as reserved. + assert(R.second.IsReserved()); + assert((countPopulation(R.first) > 1) && "Expected a group!"); + ReserveResource(R.first); + BusyResources[ResourceRef(R.first, R.first)] += CS.size(); + } + } +} + +void ResourceManager::CycleEvent(SmallVector &ResourcesFreed) { + for (auto &BR : BusyResources) { + if (BR.second) + BR.second--; + if (!BR.second) { + // Release this resource. + const ResourceRef &RR = BR.first; + + if (countPopulation(RR.first) == 1) + Release(RR); + + ReleaseResource(RR.first); + ResourcesFreed.push_back(RR); + } + } + + for (const auto &RF : ResourcesFreed) + BusyResources.erase(RF); +} + +Instruction *Scheduler::ScheduleInstruction(unsigned Idx, Instruction *MCIS) { + assert(WaitQueue.find(Idx) == WaitQueue.end()); + assert(ReadyQueue.find(Idx) == ReadyQueue.end()); + assert(IssuedQueue.find(Idx) == IssuedQueue.end()); + + // Special case where MCIS is a zero-latency instruction. A zero-latency + // instruction doesn't consume any scheduler resources. That is because it + // doesn't need to be executed. Most of the times, zero latency instructions + // are removed at register renaming stage. For example, register-register + // moves can be removed at register renaming stage by creating new aliases. + // Zero-idiom instruction (for example: a `xor reg, reg`) can also be + // eliminated at register renaming stage, since we know in advance that those + // clear their output register. + if (MCIS->IsZeroLatency()) { + MCIS->ForceExecuted(); + NotifyInstructionIssued(Idx, {}); + NotifyInstructionExecuted(Idx); + return MCIS; + } + + const InstrDesc &Desc = MCIS->GetDesc(); + + // Consume entries in the reservation stations. + Resources->ReserveBuffers(Desc); + + // Mark units with BufferSize=0 as reserved. These resources will only + // be released after MCIS is issued, and all the ResourceCycles for + // those units have been consumed. + Resources->ReserveDispatchHazardResources(Desc); + + bool MayLoad = Desc.MayLoad; + bool MayStore = Desc.MayStore; + if (MayLoad || MayStore) + LSU->Reserve(Idx, MayLoad, MayStore, Desc.HasSideEffects); + + MCIS->Dispatch(); + bool IsReady = MCIS->IsReady(); + if (IsReady && (MayLoad || MayStore)) + IsReady &= LSU->IsReady(Idx); + + if (!IsReady) { + DEBUG(dbgs() << "[SCHEDULER] Adding " << Idx << " to the Wait Queue\n"); + WaitQueue[Idx] = MCIS; + return MCIS; + } + NotifyInstructionReady(Idx); + + // Special case where the instruction is ready, and it uses an in-order dispatch/issue + // processor resource. The instruction is issued immediately to the pipelines. + // Any other in-order buffered resources (i.e. BufferSize=1) are consumed. + if (Resources->MustIssueImmediately(Desc)) { + DEBUG(dbgs() << "[SCHEDULER] Instruction " << Idx << " issued immediately\n"); + IssueInstruction(MCIS, Idx); + return MCIS; + } + + DEBUG(dbgs() << "[SCHEDULER] Adding " << Idx << " to the Ready Queue\n"); + ReadyQueue[Idx] = MCIS; + return MCIS; +} + +void Scheduler::CycleEvent(unsigned /* unused */) { + SmallVector ResourcesFreed; + Resources->CycleEvent(ResourcesFreed); + + for (const ResourceRef &RR : ResourcesFreed) + NotifyResourceAvailable(RR); + + UpdateIssuedQueue(); + UpdatePendingQueue(); + Issue(); +} + +#ifndef NDEBUG +void Scheduler::dump() const { + dbgs() << "[SCHEDULER]: WaitQueue size is: " << WaitQueue.size() << '\n'; + dbgs() << "[SCHEDULER]: ReadyQueue size is: " << ReadyQueue.size() << '\n'; + dbgs() << "[SCHEDULER]: IssuedQueue size is: " << IssuedQueue.size() << '\n'; + Resources->dump(); +} +#endif + +Scheduler::Event Scheduler::CanBeDispatched(const InstrDesc &Desc) const { + if (Desc.MayLoad && LSU->IsLQFull()) + return HWS_LD_QUEUE_UNAVAILABLE; + if (Desc.MayStore && LSU->IsSQFull()) + return HWS_ST_QUEUE_UNAVAILABLE; + + Scheduler::Event Event; + switch (Resources->CanBeDispatched(Desc)) { + case ResourceStateEvent::RS_BUFFER_AVAILABLE: + Event = HWS_AVAILABLE; + break; + case ResourceStateEvent::RS_BUFFER_UNAVAILABLE: + Event = HWS_QUEUE_UNAVAILABLE; + break; + case ResourceStateEvent::RS_RESERVED: + Event = HWS_DISPATCH_GROUP_RESTRICTION; + } + return Event; +} + +void Scheduler::IssueInstruction(Instruction *IS, unsigned InstrIndex) { + // Issue the instruction and collect all the consumed resources + // into a vector. That vector is then used to notify the listener. + // Most instructions consume very few resurces (typically one or + // two resources). We use a small vector here, and conservatively + // initialize its capacity to 4. This should address the majority of + // the cases. + SmallVector, 4> UsedResources; + + const InstrDesc &D = IS->GetDesc(); + Resources->IssueInstruction(InstrIndex, D, UsedResources); + // Notify the instruction that it started executing. + // This updates the internal state of each write. + IS->Execute(); + + if (D.MaxLatency) { + IssuedQueue[InstrIndex] = IS; + NotifyInstructionIssued(InstrIndex, UsedResources); + } else { + // A zero latency instruction which reads and/or updates registers. + NotifyInstructionIssued(InstrIndex, UsedResources); + NotifyInstructionExecuted(InstrIndex); + } +} + +void Scheduler::Issue() { + std::vector ToRemove; + for (auto QueueEntry : ReadyQueue) { + // Give priority to older instructions in ReadyQueue. The ready queue is + // ordered by key, and therefore older instructions are visited first. + Instruction *IS = QueueEntry.second; + const InstrDesc &D = IS->GetDesc(); + if (!Resources->CanBeIssued(D)) + continue; + unsigned InstrIndex = QueueEntry.first; + IssueInstruction(IS, InstrIndex); + ToRemove.emplace_back(InstrIndex); + } + + for (const unsigned InstrIndex : ToRemove) + ReadyQueue.erase(InstrIndex); +} + +void Scheduler::UpdatePendingQueue() { + // Scan the set of waiting instructions and promote them to the + // ready queue if operands are all ready. + for (auto I = WaitQueue.begin(), E = WaitQueue.end(); I != E;) { + I->second->CycleEvent(); + + const InstrDesc &Desc = I->second->GetDesc(); + bool IsMemOp = Desc.MayLoad || Desc.MayStore; + bool IsReady = I->second->IsReady(); + if (IsReady && IsMemOp) + IsReady &= LSU->IsReady(I->first); + + if (IsReady) { + NotifyInstructionReady(I->first); + ReadyQueue[I->first] = I->second; + auto ToRemove = I; + ++I; + WaitQueue.erase(ToRemove); + } else { + ++I; + } + } +} + +void Scheduler::UpdateIssuedQueue() { + for (auto I = IssuedQueue.begin(), E = IssuedQueue.end(); I != E;) { + I->second->CycleEvent(); + if (I->second->IsExecuted()) { + NotifyInstructionExecuted(I->first); + auto ToRemove = I; + ++I; + IssuedQueue.erase(ToRemove); + } else { + DEBUG(dbgs() << "[SCHEDULER]: Instruction " << I->first + << " is still executing.\n"); + ++I; + } + } +} + +void Scheduler::NotifyInstructionIssued( + unsigned Index, const ArrayRef> &Used) { + Owner->NotifyInstructionIssued(Index, Used); +} + +void Scheduler::NotifyInstructionExecuted(unsigned Index) { + LSU->OnInstructionExecuted(Index); + Owner->NotifyInstructionExecuted(Index); +} + +void Scheduler::NotifyInstructionReady(unsigned Index) { + Owner->NotifyInstructionReady(Index); +} + +void Scheduler::NotifyResourceAvailable(const ResourceRef &RR) { + Owner->NotifyResourceAvailable(RR); +} +} // namespace mca Index: tools/llvm-mca/SourceMgr.h =================================================================== --- tools/llvm-mca/SourceMgr.h +++ tools/llvm-mca/SourceMgr.h @@ -0,0 +1,65 @@ +//===--------------------- SourceMgr.h --------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements class SourceMgr. Class SourceMgr abstracts the input +/// code sequence (a sequence of MCInst), and assings unique identifiers to +/// every instruction in the sequence. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H +#define LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H + +#include "llvm/MC/MCInst.h" +#include + +namespace mca { + +using namespace llvm; + +typedef std::pair InstRef; + +class SourceMgr { + using InstVec = std::vector>; + InstVec Sequence; + unsigned Current; + unsigned Iterations; + static const unsigned DefaultIterations = 70; +public: + SourceMgr(unsigned NumIterations) + : Current(0), Iterations(NumIterations ? NumIterations : DefaultIterations) {} + + unsigned GetCurrentIteration() const { return Current / Sequence.size(); } + unsigned GetNumIterations() const { return Iterations; } + unsigned size() const { return Sequence.size(); } + const InstVec &GetSequence() const { return Sequence; } + InstVec &GetSequence() { return Sequence; } + + bool HasNext() { return Current < (Iterations * size()); } + void UpdateNext() { Current++; } + + const InstRef PeekNext() const { + unsigned Index = GetCurrentInstructionIndex(); + return InstRef(Current, Sequence[Index].get()); + } + + unsigned GetCurrentInstructionIndex() const { + return Current % Sequence.size(); + } + + const MCInst &GetMCInstFromIndex(unsigned Index) const { + return *Sequence[Index % size()]; + } + + bool IsEmpty() const { return size() == 0; } +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/TimelineView.h =================================================================== --- tools/llvm-mca/TimelineView.h +++ tools/llvm-mca/TimelineView.h @@ -0,0 +1,184 @@ +//===--------------------- TimelineView.h ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \brief +/// +/// This file implements a timeline view for the llvm-mca tool. +/// +/// Class TimelineView observes events generated by the backend. For every +/// instruction executed by the backend, it stores information related to +/// state transition. It then plots that information in the form of a table +/// as reported by the example below: +/// +/// Timeline view: +/// 0123456 +/// Index 0123456789 +/// +/// [0,0] DeER . . .. vmovshdup %xmm0, %xmm1 +/// [0,1] DeER . . .. vpermilpd $1, %xmm0, %xmm2 +/// [0,2] .DeER. . .. vpermilps $231, %xmm0, %xmm5 +/// [0,3] .DeeeER . .. vaddss %xmm1, %xmm0, %xmm3 +/// [0,4] . D==eeeER. .. vaddss %xmm3, %xmm2, %xmm4 +/// [0,5] . D=====eeeER .. vaddss %xmm4, %xmm5, %xmm6 +/// +/// [1,0] . DeE------R .. vmovshdup %xmm0, %xmm1 +/// [1,1] . DeE------R .. vpermilpd $1, %xmm0, %xmm2 +/// [1,2] . DeE-----R .. vpermilps $231, %xmm0, %xmm5 +/// [1,3] . D=eeeE--R .. vaddss %xmm1, %xmm0, %xmm3 +/// [1,4] . D===eeeER .. vaddss %xmm3, %xmm2, %xmm4 +/// [1,5] . D======eeeER vaddss %xmm4, %xmm5, %xmm6 +/// +/// There is an entry for every instruction in the input assembly sequence. +/// The first field is a pair of numbers obtained from the instruction index. +/// The first element of the pair is the iteration index, while the second +/// element of the pair is a sequence number (i.e. a position in the assembly +/// sequence). +/// The second field of the table is the actual timeline information; each +/// column is the information related to a specific cycle of execution. +/// The timeline of an instruction is described by a sequence of character +/// where each character represents the instruction state at a specific cycle. +/// +/// Possible instruction states are: +/// D: Instruction Dispatched +/// e: Instruction Executing +/// E: Instruction Executed (write-back stage) +/// R: Instruction retired +/// =: Instruction waiting in the Scheduler's queue +/// -: Instruction executed, waiting to retire in order. +/// +/// dots ('.') and empty spaces are cycles where the instruction is not +/// in-flight. +/// +/// The last column is the assembly instruction associated to the entry. +/// +/// Based on the timeline view information from the example, instruction 0 +/// at iteration 0 was dispatched at cycle 0, and was retired at cycle 3. +/// Instruction [0,1] was also dispatched at cycle 0, and it retired at +/// the same cycle than instruction [0,0]. +/// Instruction [0,4] has been dispatched at cycle 2. However, it had to +/// wait for two cycles before being issued. That is because operands +/// became ready only at cycle 5. +/// +/// This view helps further understanding bottlenecks and the impact of +/// resource pressure on the code. +/// +/// To better understand why instructions had to wait for multiple cycles in +/// the scheduler's queue, class TimelineView also reports extra timing info +/// in another table named "Average Wait times" (see example below). +/// +/// +/// Average Wait times (based on the timeline view): +/// [0]: Executions +/// [1]: Average time spent waiting in a scheduler's queue +/// [2]: Average time spent waiting in a scheduler's queue while ready +/// [3]: Average time elapsed from WB until retire stage +/// +/// [0] [1] [2] [3] +/// 0. 2 1.0 1.0 3.0 vmovshdup %xmm0, %xmm1 +/// 1. 2 1.0 1.0 3.0 vpermilpd $1, %xmm0, %xmm2 +/// 2. 2 1.0 1.0 2.5 vpermilps $231, %xmm0, %xmm5 +/// 3. 2 1.5 0.5 1.0 vaddss %xmm1, %xmm0, %xmm3 +/// 4. 2 3.5 0.0 0.0 vaddss %xmm3, %xmm2, %xmm4 +/// 5. 2 6.5 0.0 0.0 vaddss %xmm4, %xmm5, %xmm6 +/// +/// By comparing column [2] with column [1], we get an idea about how many +/// cycles were spent in the scheduler's queue due to data dependencies. +/// +/// In this example, instruction 5 spent an average of ~6 cycles in the +/// scheduler's queue. As soon as operands became ready, the instruction +/// was immediately issued to the pipeline(s). +/// That is expected because instruction 5 cannot transition to the "ready" +/// state until %xmm4 is written by instruction 4. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H +#define LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H + +#include "HWEventListener.h" +#include "SourceMgr.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/raw_ostream.h" +#include + + +namespace mca { + +using namespace llvm; + +/// \brief This class listens to instruction state transition events +/// in order to construct a timeline information. +/// +/// For every instruction executed by the Backend, this class constructs +/// a TimelineViewEntry object. TimelineViewEntry objects are then used +/// to print the timeline information, as well as the "average wait times" +/// for every instruction in the input assembly sequence. +class TimelineView : public HWEventListener { + const MCSubtargetInfo &STI; + MCInstPrinter &MCIP; + const SourceMgr &AsmSequence; + + unsigned CurrentCycle; + unsigned MaxCycle; + unsigned LastCycle; + + struct TimelineViewEntry { + unsigned CycleDispatched; + unsigned CycleReady; + unsigned CycleIssued; + unsigned CycleExecuted; + unsigned CycleRetired; + }; + std::vector Timeline; + + struct WaitTimeEntry { + unsigned Executions; + unsigned CyclesSpentInSchedulerQueue; + unsigned CyclesSpentInSQWhileReady; + unsigned CyclesSpentAfterWBAndBeforeRetire; + }; + std::vector WaitTime; + + void printTimelineViewEntry(raw_string_ostream &OS, + const TimelineViewEntry &E, unsigned Iteration, + unsigned SourceIndex) const; + void printWaitTimeEntry(raw_string_ostream &OS, const WaitTimeEntry &E, + unsigned Index) const; + + const unsigned DEFAULT_ITERATIONS = 10; + +public: + TimelineView(const MCSubtargetInfo &sti, MCInstPrinter &Printer, + const SourceMgr &Sequence, unsigned MaxIterations, + unsigned Cycles) + : STI(sti), MCIP(Printer), AsmSequence(Sequence), CurrentCycle(0), + MaxCycle(Cycles == 0 ? 80 : Cycles), LastCycle(0) { + initialize(MaxIterations); + } + + void initialize(unsigned MaxIterations); + + // Event handlers. + void OnInstructionDispatched(unsigned Index) override; + void OnInstructionReady(unsigned Index) override; + void OnInstructionIssued( + unsigned Index, + const ArrayRef> &Used) override; + void OnInstructionExecuted(unsigned Index) override; + void OnInstructionRetired(unsigned Index) override; + void OnCycleBegin(unsigned Cycle) override { CurrentCycle = Cycle; } + + // print functionalities. + void printTimeline(raw_ostream &OS) const; + void printAverageWaitTimes(raw_ostream &OS) const; +}; + +} // namespace mca + +#endif Index: tools/llvm-mca/TimelineView.cpp =================================================================== --- tools/llvm-mca/TimelineView.cpp +++ tools/llvm-mca/TimelineView.cpp @@ -0,0 +1,247 @@ +//===--------------------- TimelineView.cpp ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \brief +/// +/// This file implements the TimelineView interface. +/// +//===----------------------------------------------------------------------===// + +#include "TimelineView.h" + +using namespace llvm; + +namespace mca { + +void TimelineView::initialize(unsigned MaxIterations) { + unsigned NumInstructions = + AsmSequence.GetNumIterations() * AsmSequence.size(); + if (!MaxIterations) + MaxIterations = DEFAULT_ITERATIONS; + unsigned NumEntries = + std::min(NumInstructions, MaxIterations * AsmSequence.size()); + Timeline.resize(NumEntries); + TimelineViewEntry NullTVEntry = {0, 0, 0, 0, 0}; + std::fill(Timeline.begin(), Timeline.end(), NullTVEntry); + + WaitTime.resize(AsmSequence.size()); + WaitTimeEntry NullWTEntry = {0, 0, 0, 0}; + std::fill(WaitTime.begin(), WaitTime.end(), NullWTEntry); +} + +void TimelineView::OnInstructionDispatched(unsigned Index) { + if (CurrentCycle >= MaxCycle || Index >= Timeline.size()) + return; + Timeline[Index].CycleDispatched = CurrentCycle; + LastCycle = std::max(LastCycle, CurrentCycle); +} + +void TimelineView::OnInstructionReady(unsigned Index) { + if (CurrentCycle >= MaxCycle || Index >= Timeline.size()) + return; + Timeline[Index].CycleReady = CurrentCycle; + LastCycle = std::max(LastCycle, CurrentCycle); +} + +void TimelineView::OnInstructionIssued( + unsigned Index, + const ArrayRef> & /* Unused */) { + if (CurrentCycle >= MaxCycle || Index >= Timeline.size()) + return; + Timeline[Index].CycleIssued = CurrentCycle; + LastCycle = std::max(LastCycle, CurrentCycle); +} + +void TimelineView::OnInstructionExecuted(unsigned Index) { + if (CurrentCycle >= MaxCycle || Index >= Timeline.size()) + return; + Timeline[Index].CycleExecuted = CurrentCycle; + LastCycle = std::max(LastCycle, CurrentCycle); +} + +void TimelineView::OnInstructionRetired(unsigned Index) { + if (CurrentCycle >= MaxCycle || Index >= Timeline.size()) + return; + TimelineViewEntry &TVEntry = Timeline[Index]; + TVEntry.CycleRetired = CurrentCycle; + LastCycle = std::max(LastCycle, CurrentCycle); + + // Update the WaitTime entry which corresponds to this Index. + + WaitTimeEntry &WTEntry = WaitTime[Index % AsmSequence.size()]; + WTEntry.Executions++; + WTEntry.CyclesSpentInSchedulerQueue += + TVEntry.CycleIssued - TVEntry.CycleDispatched; + assert(TVEntry.CycleDispatched <= TVEntry.CycleReady); + WTEntry.CyclesSpentInSQWhileReady += TVEntry.CycleIssued - TVEntry.CycleReady; + WTEntry.CyclesSpentAfterWBAndBeforeRetire += + (TVEntry.CycleRetired - 1) - TVEntry.CycleExecuted; +} + +void TimelineView::printWaitTimeEntry(raw_string_ostream &OS, + const WaitTimeEntry &Entry, + unsigned SourceIndex) const { + OS << SourceIndex << '.'; + if (SourceIndex < 10) + OS << " "; + else if (SourceIndex < 100) + OS << " "; + else if (SourceIndex < 1000) + OS << " "; + else + OS << ' '; + + if (Entry.Executions == 0) { + OS << " - - - - "; + } else { + double AverageTime1, AverageTime2, AverageTime3; + unsigned Executions = Entry.Executions; + AverageTime1 = (double)Entry.CyclesSpentInSchedulerQueue / Executions; + AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / Executions; + AverageTime3 = (double)Entry.CyclesSpentAfterWBAndBeforeRetire / Executions; + if (Executions < 10) + OS << ' ' << Executions << " "; + else if (Executions < 100) + OS << ' ' << Executions << " "; + else + OS << Executions << " "; + + + OS << format("%.1f", AverageTime1); + if (AverageTime1 < 10.0) + OS << " "; + else if (AverageTime1 < 100.0) + OS << " "; + else + OS << " "; + + OS << format("%.1f", AverageTime2); + if (AverageTime2 < 10.0) + OS << " "; + else if (AverageTime2 < 100.0) + OS << " "; + else + OS << " "; + + OS << format("%.1f", AverageTime3); + if (AverageTime3 < 10.0) + OS << " "; + else if (AverageTime3 < 100.0) + OS << ' '; + } +} + +void TimelineView::printAverageWaitTimes(raw_ostream &OS) const { + if (WaitTime.empty()) + return; + + std::string Buffer; + raw_string_ostream TempStream(Buffer); + + TempStream + << "\n\nAverage Wait times (based on the timeline view):\n" + << "[0]: Executions\n" + << "[1]: Average time spent waiting in a scheduler's queue\n" + << "[2]: Average time spent waiting in a scheduler's queue while ready\n" + << "[3]: Average time elapsed from WB until retire stage\n\n"; + TempStream << " [0] [1] [2] [3]\n"; + + for (unsigned i = 0; i < WaitTime.size(); ++i) { + printWaitTimeEntry(TempStream, WaitTime[i], i); + // Append the instruction info at the end of the line. + const MCInst &Inst = AsmSequence.GetMCInstFromIndex(i); + MCIP.printInst(&Inst, TempStream, "", STI); + TempStream << '\n'; + TempStream.flush(); + OS << Buffer; + Buffer = ""; + } +} + +void TimelineView::printTimelineViewEntry(raw_string_ostream &OS, + const TimelineViewEntry &E, + unsigned Iteration, + unsigned SourceIndex) const { + if (SourceIndex == 0) + OS << '\n'; + OS << '[' << Iteration << ',' << SourceIndex << "]\t"; + for (unsigned i = 0; i < E.CycleDispatched; ++i) + OS << ((i % 5 == 0) ? '.' : ' '); + OS << 'D'; + if (E.CycleDispatched != E.CycleExecuted) { + // Zero latency instructions have the same value for CycleDispatched, + // CycleIssued and CycleExecuted. + for (unsigned i = E.CycleDispatched + 1; i < E.CycleIssued; ++i) + OS << '='; + if (E.CycleIssued == E.CycleExecuted) + OS << 'E'; + else { + if (E.CycleDispatched != E.CycleIssued) + OS << 'e'; + for (unsigned i = E.CycleIssued + 1; i < E.CycleExecuted; ++i) + OS << 'e'; + OS << 'E'; + } + } + + for (unsigned i = E.CycleExecuted + 1; i < E.CycleRetired; ++i) + OS << '-'; + OS << 'R'; + + // Skip other columns. + for (unsigned i = E.CycleRetired + 1; i <= LastCycle; ++i) + OS << ((i % 5 == 0 || i == LastCycle) ? '.' : ' '); +} + +static void printTimelineHeader(raw_string_ostream &OS, unsigned Cycles) { + OS << "\n\nTimeline view:\n"; + OS << " \t"; + for (unsigned i = 0; i <= Cycles; ++i) { + if (((i / 10) & 1) == 0) + OS << ' '; + else + OS << i % 10; + } + + OS << "\nIndex\t"; + for (unsigned i = 0; i <= Cycles; ++i) { + if (((i / 10) & 1) == 0) + OS << i % 10; + else + OS << ' '; + } + OS << '\n'; +} + +void TimelineView::printTimeline(raw_ostream &OS) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + + printTimelineHeader(TempStream, LastCycle); + TempStream.flush(); + OS << Buffer; + + for (unsigned Idx = 0; Idx < Timeline.size(); ++Idx) { + Buffer = ""; + const TimelineViewEntry &E = Timeline[Idx]; + if (E.CycleRetired == 0) + return; + + unsigned Iteration = Idx / AsmSequence.size(); + unsigned SourceIndex = Idx % AsmSequence.size(); + printTimelineViewEntry(TempStream, E, Iteration, SourceIndex); + // Append the instruction info at the end of the line. + const MCInst &Inst = AsmSequence.GetMCInstFromIndex(Idx); + MCIP.printInst(&Inst, TempStream, "", STI); + TempStream << '\n'; + TempStream.flush(); + OS << Buffer; + } +} + +} // namespace mca Index: tools/llvm-mca/llvm-mca.cpp =================================================================== --- tools/llvm-mca/llvm-mca.cpp +++ tools/llvm-mca/llvm-mca.cpp @@ -0,0 +1,328 @@ +//===-- llvm-mca.cpp - Machine Code Analyzer -------------------*- C++ -* -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This utility is a simple driver that allows static performance analysis on +// machine code similarly to how IACA (Intel Architecture Code Analyzer) works. +// +// llvm-mca [options] +// -march +// -mcpu +// -o +// +// The target defaults to the host target. +// The cpu defaults to 'generic'. +// The output defaults to standard output. +// +//===----------------------------------------------------------------------===// + +#include "BackendPrinter.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" + +using namespace llvm; + +static cl::opt + InputFilename(cl::Positional, cl::desc(""), cl::init("-")); + +static cl::opt OutputFilename("o", cl::desc("Output filename"), + cl::init("-"), + cl::value_desc("filename")); + +static cl::opt + ArchName("march", cl::desc("Target arch to assemble for, " + "see -version for available targets")); + +static cl::opt + TripleName("mtriple", cl::desc("Target triple to assemble for, " + "see -version for available targets")); + +static cl::opt + MCPU("mcpu", + cl::desc("Target a specific cpu type (-mcpu=help for details)"), + cl::value_desc("cpu-name"), cl::init("generic")); + +static cl::list + MAttrs("mattr", cl::CommaSeparated, + cl::desc("Target specific attributes (-mattr=help for details)"), + cl::value_desc("a1,+a2,-a3,...")); + +static cl::opt MainFileName( + "main-file-name", + cl::desc("Specifies the name we should consider the input file")); + +static cl::opt + OutputAsmVariant("output-asm-variant", + cl::desc("Syntax variant to use for output printing")); + +static cl::opt Iterations("iterations", + cl::desc("Number of iterations to run"), + cl::init(0)); + +static cl::opt DispatchWidth( + "dispatch", + cl::desc("Dispatch Width. By default it is set equal to IssueWidth"), + cl::init(0)); + +static cl::opt MaxRetirePerCycle( + "max-retire-per-cycle", + cl::desc("Maximum number of instructions that can be retired in one cycle"), + cl::init(0)); + +static cl::opt + RegisterFileSize("register-file-size", + cl::desc("Maximum number of temporary registers which can " + "be used for register mappings"), + cl::init(0)); + +static cl::opt PrintTimelineView("timeline", + cl::desc("Print the timeline view"), + cl::init(false)); + +static cl::opt TimelineMaxIterations( + "timeline-max-iterations", + cl::desc("Maximum number of iterations to print in timeline view."), + cl::init(0)); + +static cl::opt TimelineMaxCycles( + "timeline-max-cycles", + cl::desc( + "Maximum number of cycles in the timeline view. Defaults to 80 cycles"), + cl::init(80)); + +static cl::opt PrintModeVerbose("verbose", + cl::desc("Enable verbose output"), + cl::init(false)); + +static cl::opt + AssumeNoAlias("noalias", + cl::desc("Assume that there is no memory aliasing"), + cl::init(false)); + +static cl::opt + LoadQueueSize("lqueue", cl::desc("Size of the load queue"), cl::init(0)); +static cl::opt + StoreQueueSize("squeue", cl::desc("Size of the store queue"), cl::init(0)); + +static const Target *GetTarget(const char *ProgName) { + TripleName = Triple::normalize(TripleName); + if (TripleName.empty()) + TripleName = Triple::normalize(sys::getDefaultTargetTriple()); + Triple TheTriple(TripleName); + + // Get the target specific parser. + std::string Error; + const Target *TheTarget = + TargetRegistry::lookupTarget(ArchName, TheTriple, Error); + if (!TheTarget) { + errs() << ProgName << ": " << Error; + return nullptr; + } + + // Return the found target. + return TheTarget; +} + +static int AssembleInput(const char *ProgName, const Target *TheTarget, + SourceMgr &SrcMgr, MCContext &Ctx, MCStreamer &Str, + MCAsmInfo &MAI, MCSubtargetInfo &STI, + MCInstrInfo &MCII, MCTargetOptions &MCOptions) { + std::unique_ptr Parser(createMCAsmParser(SrcMgr, Ctx, Str, MAI)); + std::unique_ptr TAP( + TheTarget->createMCAsmParser(STI, *Parser, MCII, MCOptions)); + + if (!TAP) { + errs() << ProgName + << ": error: this target does not support assembly parsing.\n"; + return 1; + } + + Parser->setTargetParser(*TAP); + return Parser->Run(false); +} + +namespace { + +class MCStreamerWrapper final : public MCStreamer { + using InstVec = std::vector>; + InstVec &Insts; + +public: + MCStreamerWrapper(MCContext &Context, InstVec &Vec) + : MCStreamer(Context), Insts(Vec) {} + + // We only want to intercept the emission of new instructions. + virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool /* unused */) override { + Insts.emplace_back(new MCInst(Inst)); + } + + bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override { + return true; + } + + void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) override {} + void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr, + uint64_t Size = 0, unsigned ByteAlignment = 0) override {} + void EmitGPRel32Value(const MCExpr *Value) override {} + void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {} + void EmitCOFFSymbolStorageClass(int StorageClass) override {} + void EmitCOFFSymbolType(int Type) override {} + void EndCOFFSymbolDef() override {} + + const InstVec &GetInstructionSequence() const { return Insts; } +}; + +} // end of anonymous namespace + +int main(int argc, char **argv) { + sys::PrintStackTraceOnErrorSignal(argv[0]); + PrettyStackTraceProgram X(argc, argv); + llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. + + // Initialize targets and assembly parsers. + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + + // Enable printing of available targets when flag --version is specified. + cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion); + + // Parse flags and initialize target options. + cl::ParseCommandLineOptions(argc, argv, + "llvm machine code performance analyzer.\n"); + MCTargetOptions MCOptions; + MCOptions.PreserveAsmComments = false; + + // Get the target from the triple. If a triple is not specified, then select + // the default triple for the host. If the triple doesn't correspond to any + // registered target, then exit with an error message. + const char *ProgName = argv[0]; + const Target *TheTarget = GetTarget(ProgName); + if (!TheTarget) + return 1; + + // GetTarget() may replaced TripleName with a default triple. + // For safety, reconstruct the Triple object. + Triple TheTriple(TripleName); + + ErrorOr> BufferPtr = + MemoryBuffer::getFileOrSTDIN(InputFilename); + if (std::error_code EC = BufferPtr.getError()) { + errs() << InputFilename << ": " << EC.message() << '\n'; + return 1; + } + + SourceMgr SrcMgr; + + // Tell SrcMgr about this buffer, which is what the parser will pick up. + SrcMgr.AddNewSourceBuffer(std::move(*BufferPtr), SMLoc()); + + std::unique_ptr MRI(TheTarget->createMCRegInfo(TripleName)); + assert(MRI && "Unable to create target register info!"); + + std::unique_ptr MAI(TheTarget->createMCAsmInfo(*MRI, TripleName)); + assert(MAI && "Unable to create target asm info!"); + + MCObjectFileInfo MOFI; + MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr); + MOFI.InitMCObjectFileInfo(TheTriple, /* PIC= */ false, Ctx); + + if (!MainFileName.empty()) + Ctx.setMainFileName(MainFileName); + + // Package up features to be passed to target/subtarget + std::string FeaturesStr; + if (MAttrs.size()) { + SubtargetFeatures Features; + for (unsigned i = 0; i != MAttrs.size(); ++i) + Features.AddFeature(MAttrs[i]); + FeaturesStr = Features.getString(); + } + + std::unique_ptr BOS; + std::unique_ptr S = + llvm::make_unique(Iterations); + MCStreamerWrapper Str(Ctx, S->GetSequence()); + + std::unique_ptr MCII(TheTarget->createMCInstrInfo()); + std::unique_ptr STI( + TheTarget->createMCSubtargetInfo(TripleName, MCPU, FeaturesStr)); + if (!STI->isCPUStringValid(MCPU)) + return 1; + + if (!STI->getSchedModel().isOutOfOrder()) { + errs() << "error: please specify an out-of-order cpu. '" << MCPU + << "' is an in-order cpu.\n"; + return 1; + } + + if (!STI->getSchedModel().hasInstrSchedModel()) { + errs() + << "error: unable to find instruction-level scheduling information for" + << " target triple '" << TheTriple.normalize() << "' and cpu '" << MCPU + << "'.\n"; + + if (STI->getSchedModel().InstrItineraries) + errs() << "note: cpu '" << MCPU << "' provides itineraries. However, " + << "instruction itineraries are currently unsupported.\n"; + return 1; + } + + std::unique_ptr IP(TheTarget->createMCInstPrinter( + Triple(TripleName), OutputAsmVariant, *MAI, *MCII, *MRI)); + if (!IP) { + errs() << "error: unable to create instruction printer for target triple '" + << TheTriple.normalize() << "' with assembly variant " + << OutputAsmVariant << ".\n"; + return 1; + } + + int Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, Str, *MAI, *STI, + *MCII, MCOptions); + if (Res) + return Res; + + if (S->IsEmpty()) { + errs() << "error: no assembly instructions found.\n"; + return 1; + } + + const MCSchedModel &SM = STI->getSchedModel(); + + unsigned Width = SM.IssueWidth; + if (DispatchWidth) + Width = DispatchWidth; + + std::unique_ptr B = llvm::make_unique( + *STI, *MCII, *MRI, std::move(S), Width, RegisterFileSize, MaxRetirePerCycle, + LoadQueueSize, StoreQueueSize, AssumeNoAlias); + + std::unique_ptr Printer = + llvm::make_unique(*B, OutputFilename, std::move(IP), + PrintModeVerbose); + Printer->AddResourcePressureView(); + if (PrintTimelineView) + Printer->AddTimelineView(TimelineMaxIterations, TimelineMaxCycles); + + B->Run(); + Printer->print(); + return 0; +} Index: utils/TableGen/SubtargetEmitter.cpp =================================================================== --- utils/TableGen/SubtargetEmitter.cpp +++ utils/TableGen/SubtargetEmitter.cpp @@ -616,7 +616,7 @@ OS << "static const llvm::MCProcResourceDesc " << ProcModel.ModelName << "ProcResources" << "[] = {\n" - << " {DBGFIELD(\"InvalidUnit\") 0, 0, 0, 0},\n"; + << " {\"InvalidUnit\", 0, 0, 0, 0},\n"; unsigned SubUnitsOffset = 1; for (unsigned i = 0, e = ProcModel.ProcResourceDefs.size(); i < e; ++i) { @@ -644,8 +644,9 @@ } NumUnits = PRDef->getValueAsInt("NumUnits"); } + // Emit the ProcResourceDesc - OS << " {DBGFIELD(\"" << PRDef->getName() << "\") "; + OS << " {\"" << PRDef->getName() << "\", "; if (PRDef->getName().size() < 15) OS.indent(15 - PRDef->getName().size()); OS << NumUnits << ", " << SuperIdx << ", " << BufferSize << ", ";