Index: llvm/trunk/include/llvm/MC/MCSchedule.h =================================================================== --- llvm/trunk/include/llvm/MC/MCSchedule.h +++ llvm/trunk/include/llvm/MC/MCSchedule.h @@ -128,6 +128,51 @@ } }; +/// Specify the cost of a register definition in terms of number of physical +/// registers allocated at register renaming stage. For example, AMD Jaguar +/// natively supports 128-bit data types, and operations on 256-bit registers +/// (i.e. YMM registers) are internally split into two COPs (complex operations) +/// and each COP updates a physical register. Basically, on Jaguar, a YMM +/// register write effectively consumes two physical registers. That means, +/// the cost of a YMM write in the BtVer2 model is 2. +struct MCRegisterCostEntry { + unsigned RegisterClassID; + unsigned Cost; +}; + +/// A register file descriptor. +/// +/// This struct allows to describe processor register files. In particular, it +/// helps describing the size of the register file, as well as the cost of +/// allocating a register file at register renaming stage. +/// FIXME: this struct can be extended to provide information about the number +/// of read/write ports to the register file. A value of zero for field +/// 'NumPhysRegs' means: this register file has an unbounded number of physical +/// registers. +struct MCRegisterFileDesc { + const char *Name; + uint16_t NumPhysRegs; + uint16_t NumRegisterCostEntries; + // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable. + uint16_t RegisterCostEntryIdx; +}; + +/// Provide extra details about the machine processor. +/// +/// This is a collection of "optional" processor information that is not +/// normally used by the LLVM machine schedulers, but that can be consumed by +/// external tools like llvm-mca to improve the quality of the performance +/// analysis. 
+/// In future, the plan is to extend this struct with extra information (for +/// example: maximum number of instructions retired per cycle; actual size of +/// the reorder buffer; etc.). +struct MCExtraProcessorInfo { + const MCRegisterFileDesc *RegisterFiles; + unsigned NumRegisterFiles; + const MCRegisterCostEntry *RegisterCostTable; + unsigned NumRegisterCostEntries; +}; + /// Machine model for scheduling, bundling, and heuristics. /// /// The machine model directly provides basic information about the @@ -198,11 +243,21 @@ friend class InstrItineraryData; const InstrItinerary *InstrItineraries; + const MCExtraProcessorInfo *ExtraProcessorInfo; + + bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; } + unsigned getProcessorID() const { return ProcID; } /// Does this machine model include instruction-level scheduling. bool hasInstrSchedModel() const { return SchedClassTable; } + const MCExtraProcessorInfo &getExtraProcessorInfo() const { + assert(hasExtraProcessorInfo() && + "No extra information available for this model"); + return *ExtraProcessorInfo; + } + /// Return true if this machine model data for all instructions with a /// scheduling class (itinerary class or SchedRW list). bool isComplete() const { return CompleteModel; } Index: llvm/trunk/include/llvm/Target/TargetSchedule.td =================================================================== --- llvm/trunk/include/llvm/Target/TargetSchedule.td +++ llvm/trunk/include/llvm/Target/TargetSchedule.td @@ -442,3 +442,20 @@ SchedReadWrite AliasRW = alias; SchedMachineModel SchedModel = ?; } + +// Allow the definition of processor register files. +// Each processor register file declares the number of physical registers, as +// well as optional register cost information. The cost of a register R is the +// number of physical registers used to rename R (at register renaming stage). +// That value defaults to 1, to all the registers contained in the register +// file. 
The set of target register files is inferred from the list of register +// classes. Register costs are defined at register class granularity. An empty +// list of register classes means that this register file contains all the +// registers defined by the target. +class RegisterFile<int numPhysRegs, list<RegisterClass> Classes = [], + list<int> Costs = []> { + list<RegisterClass> RegClasses = Classes; + list<int> RegCosts = Costs; + int NumPhysRegs = numPhysRegs; + SchedMachineModel SchedModel = ?; +} Index: llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td +++ llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td @@ -38,6 +38,16 @@ def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM +// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and +// speculative version of the 64-bit integer registers. +// Reference: www.realworldtech.com/jaguar/4/ +def IntegerPRF : RegisterFile<64, [GR8, GR16, GR32, GR64, CCR]>; + +// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE +// registers. Operations on 256-bit data types are cracked into two COPs. 
+// Reference: www.realworldtech.com/jaguar/4/ +def FpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>; + // Integer Pipe Scheduler def JALU01 : ProcResGroup<[JALU0, JALU1]> { let BufferSize=20; Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-1.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-1.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-1.s @@ -6,13 +6,30 @@ # CHECK: Iterations: 5 # CHECK-NEXT: Instructions: 10 + # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 + + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 10 +# CHECK-NEXT: Max number of mappings used: 10 + +# CHECK: * Register File #1 -- FpuPRF: +# CHECK-NEXT: Number of physical registers: 72 +# CHECK-NEXT: Total number of mappings created: 10 +# CHECK-NEXT: Max number of mappings used: 10 + +# CHECK: * Register File #2 -- IntegerPRF: +# CHECK-NEXT: Number of physical registers: 64 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 -# CHECK: Register File statistics. 
-# CHECK-NEXT: Register File #0 -# CHECK-NEXT: Total number of mappings created: 10 -# CHECK-NEXT: Max number of mappings used: 10 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-2.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-2.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-2.s @@ -6,13 +6,30 @@ # CHECK: Iterations: 5 # CHECK-NEXT: Instructions: 10 -# CHECK: Dynamic Dispatch Stall Cycles: + +# CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 13 +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 + + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 10 +# CHECK-NEXT: Max number of mappings used: 5 + +# CHECK: * Register File #1 -- FpuPRF: +# CHECK-NEXT: Number of physical registers: 72 +# CHECK-NEXT: Total number of mappings created: 10 +# CHECK-NEXT: Max number of mappings used: 5 + +# CHECK: * Register File #2 -- IntegerPRF: +# CHECK-NEXT: Number of physical registers: 64 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 -# CHECK: Register File statistics. 
-# CHECK-NEXT: Register File #0 -# CHECK-NEXT: Total number of mappings created: 10 -# CHECK-NEXT: Max number of mappings used: 5 # CHECK: Timeline view: # CHECK-NEXT: 0123456789 Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-3.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-3.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-3.s @@ -17,10 +17,26 @@ # CHECK-NEXT: 2 25 25.00 * idivl %eax -# CHECK: RAT - Register unavailable: 26 +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 26 +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 + + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 6 +# CHECK-NEXT: Max number of mappings used: 3 + +# CHECK: * Register File #1 -- FpuPRF: +# CHECK-NEXT: Number of physical registers: 72 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 -# CHECK: Register File statistics. 
-# CHECK-NEXT: Register File #0 +# CHECK: * Register File #2 -- IntegerPRF: +# CHECK-NEXT: Number of physical registers: 64 # CHECK-NEXT: Total number of mappings created: 6 # CHECK-NEXT: Max number of mappings used: 3 Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-4.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-4.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-4.s @@ -0,0 +1,49 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=22 -verbose -timeline -timeline-max-iterations=3 < %s | FileCheck %s + +idiv %eax + +# CHECK: Iterations: 22 +# CHECK-NEXT: Instructions: 22 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 25 25.00 * idivl %eax + + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 6 +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 + + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 66 +# CHECK-NEXT: Max number of mappings used: 63 + +# CHECK: * Register File #1 -- FpuPRF: +# CHECK-NEXT: Number of physical registers: 72 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 + +# CHECK: * Register File #2 -- IntegerPRF: +# CHECK-NEXT: Number of physical registers: 64 +# CHECK-NEXT: Total number of mappings created: 66 +# CHECK-NEXT: Max number of mappings used: 63 + + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 0123456789 01234567 +# CHECK-NEXT: Index 0123456789 0123456789 
0123456789 0123456789 +# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . . . idivl %eax +# CHECK: [1,0] .D========================eeeeeeeeeeeeeeeeeeeeeeeeeER . . . . . . idivl %eax +# CHECK: [2,0] . D================================================eeeeeeeeeeeeeeeeeeeeeeeeeER idivl %eax Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-5.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-5.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/register-files-5.s @@ -0,0 +1,105 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -resource-pressure=false -instruction-info=false -verbose -timeline < %s | FileCheck %s + + vdivps %ymm0, %ymm0, %ymm1 + vaddps %ymm0, %ymm0, %ymm2 + vaddps %ymm0, %ymm0, %ymm3 + vaddps %ymm0, %ymm0, %ymm4 + vaddps %ymm0, %ymm0, %ymm5 + vaddps %ymm0, %ymm0, %ymm6 + vaddps %ymm0, %ymm0, %ymm7 + vaddps %ymm0, %ymm0, %ymm8 + vaddps %ymm0, %ymm0, %ymm9 + vaddps %ymm0, %ymm0, %ymm10 + vaddps %ymm0, %ymm0, %ymm11 + vaddps %ymm0, %ymm0, %ymm12 + vaddps %ymm0, %ymm0, %ymm13 + vaddps %ymm0, %ymm0, %ymm14 + vaddps %ymm0, %ymm0, %ymm15 + vaddps %ymm2, %ymm0, %ymm0 + vaddps %ymm2, %ymm0, %ymm3 + vaddps %ymm2, %ymm0, %ymm4 + vaddps %ymm2, %ymm0, %ymm5 + vaddps %ymm2, %ymm0, %ymm6 + vaddps %ymm2, %ymm0, %ymm7 + vaddps %ymm2, %ymm0, %ymm8 + vaddps %ymm2, %ymm0, %ymm9 + vaddps %ymm2, %ymm0, %ymm10 + vaddps %ymm2, %ymm0, %ymm11 + vaddps %ymm2, %ymm0, %ymm12 + vaddps %ymm2, %ymm0, %ymm13 + vaddps %ymm2, %ymm0, %ymm14 + vaddps %ymm2, %ymm0, %ymm15 + vaddps %ymm3, %ymm0, %ymm2 + vaddps %ymm3, %ymm0, %ymm4 + vaddps %ymm3, %ymm0, %ymm5 + vaddps %ymm3, %ymm0, %ymm6 + + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 33 +# CHECK-NEXT: Total Cycles: 70 +# CHECK-NEXT: Dispatch Width: 2 +# CHECK-NEXT: IPC: 0.47 + + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 0 +# CHECK-NEXT: RCU - Retire tokens 
unavailable: 8 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 + + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 66 +# CHECK-NEXT: Max number of mappings used: 64 + +# CHECK: * Register File #1 -- FpuPRF: +# CHECK-NEXT: Number of physical registers: 72 +# CHECK-NEXT: Total number of mappings created: 66 +# CHECK-NEXT: Max number of mappings used: 64 + +# CHECK: * Register File #2 -- IntegerPRF: +# CHECK-NEXT: Number of physical registers: 64 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 + + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER . . . . . . vdivps %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [0,1] .DeeeE----------------------------------R . . . . . . vaddps %ymm0, %ymm0, %ymm2 +# CHECK-NEXT: [0,2] . D=eeeE--------------------------------R . . . . . . vaddps %ymm0, %ymm0, %ymm3 +# CHECK-NEXT: [0,3] . D==eeeE------------------------------R . . . . . . vaddps %ymm0, %ymm0, %ymm4 +# CHECK-NEXT: [0,4] . D===eeeE----------------------------R . . . . . . vaddps %ymm0, %ymm0, %ymm5 +# CHECK-NEXT: [0,5] . D====eeeE--------------------------R . . . . . . vaddps %ymm0, %ymm0, %ymm6 +# CHECK-NEXT: [0,6] . .D=====eeeE------------------------R . . . . . . vaddps %ymm0, %ymm0, %ymm7 +# CHECK-NEXT: [0,7] . . D======eeeE----------------------R . . . . . . vaddps %ymm0, %ymm0, %ymm8 +# CHECK-NEXT: [0,8] . . D=======eeeE--------------------R . . . . . . vaddps %ymm0, %ymm0, %ymm9 +# CHECK-NEXT: [0,9] . . D========eeeE------------------R . . . . . . vaddps %ymm0, %ymm0, %ymm10 +# CHECK-NEXT: [0,10] . . D=========eeeE----------------R . . . . . . vaddps %ymm0, %ymm0, %ymm11 +# CHECK-NEXT: [0,11] . . 
.D==========eeeE--------------R . . . . . . vaddps %ymm0, %ymm0, %ymm12 +# CHECK-NEXT: [0,12] . . . D===========eeeE------------R . . . . . . vaddps %ymm0, %ymm0, %ymm13 +# CHECK-NEXT: [0,13] . . . D============eeeE----------R . . . . . . vaddps %ymm0, %ymm0, %ymm14 +# CHECK-NEXT: [0,14] . . . D=============eeeE--------R . . . . . . vaddps %ymm0, %ymm0, %ymm15 +# CHECK-NEXT: [0,15] . . . D==============eeeE------R . . . . . . vaddps %ymm2, %ymm0, %ymm0 +# CHECK-NEXT: [0,16] . . . .D================eeeE---R . . . . . . vaddps %ymm2, %ymm0, %ymm3 +# CHECK-NEXT: [0,17] . . . . D=================eeeE-R . . . . . . vaddps %ymm2, %ymm0, %ymm4 +# CHECK-NEXT: [0,18] . . . . D==================eeeER . . . . . . vaddps %ymm2, %ymm0, %ymm5 +# CHECK-NEXT: [0,19] . . . . D===================eeeER . . . . . . vaddps %ymm2, %ymm0, %ymm6 +# CHECK-NEXT: [0,20] . . . . D====================eeeER . . . . . vaddps %ymm2, %ymm0, %ymm7 +# CHECK-NEXT: [0,21] . . . . .D=====================eeeER . . . . . vaddps %ymm2, %ymm0, %ymm8 +# CHECK-NEXT: [0,22] . . . . . D======================eeeER. . . . . vaddps %ymm2, %ymm0, %ymm9 +# CHECK-NEXT: [0,23] . . . . . D=======================eeeER . . . . vaddps %ymm2, %ymm0, %ymm10 +# CHECK-NEXT: [0,24] . . . . . D========================eeeER . . . . vaddps %ymm2, %ymm0, %ymm11 +# CHECK-NEXT: [0,25] . . . . . D=========================eeeER . . . vaddps %ymm2, %ymm0, %ymm12 +# CHECK-NEXT: [0,26] . . . . . .D==========================eeeER . . . vaddps %ymm2, %ymm0, %ymm13 +# CHECK-NEXT: [0,27] . . . . . . D===========================eeeER. . . vaddps %ymm2, %ymm0, %ymm14 +# CHECK-NEXT: [0,28] . . . . . . D============================eeeER . . vaddps %ymm2, %ymm0, %ymm15 +# CHECK-NEXT: [0,29] . . . . . . D=============================eeeER . . vaddps %ymm3, %ymm0, %ymm2 +# CHECK-NEXT: [0,30] . . . . . . D==============================eeeER . vaddps %ymm3, %ymm0, %ymm4 +# CHECK-NEXT: [0,31] . . . . . . .D===============================eeeER . 
vaddps %ymm3, %ymm0, %ymm5 +# CHECK-NEXT: [0,32] . . . . . . . . D========================eeeER vaddps %ymm3, %ymm0, %ymm6 Index: llvm/trunk/tools/llvm-mca/Backend.h =================================================================== --- llvm/trunk/tools/llvm-mca/Backend.h +++ llvm/trunk/tools/llvm-mca/Backend.h @@ -69,7 +69,7 @@ LoadQueueSize, StoreQueueSize, AssumeNoAlias)), DU(llvm::make_unique( - this, MRI, Subtarget.getSchedModel().MicroOpBufferSize, + this, STI, MRI, Subtarget.getSchedModel().MicroOpBufferSize, RegisterFileSize, MaxRetirePerCycle, DispatchWidth, HWS.get())), SM(Source), Cycles(0) { HWS->setDispatchUnit(DU.get()); Index: llvm/trunk/tools/llvm-mca/BackendStatistics.h =================================================================== --- llvm/trunk/tools/llvm-mca/BackendStatistics.h +++ llvm/trunk/tools/llvm-mca/BackendStatistics.h @@ -113,6 +113,8 @@ // There is one entry for each register file implemented by the processor. llvm::SmallVector RegisterFiles; + void initializeRegisterFileInfo(); + void printRetireUnitStatistics(llvm::raw_ostream &OS) const; void printDispatchUnitStatistics(llvm::raw_ostream &OS) const; void printSchedulerStatistics(llvm::raw_ostream &OS) const; @@ -131,10 +133,9 @@ public: BackendStatistics(const llvm::MCSubtargetInfo &sti) : STI(sti), NumDispatched(0), NumIssued(0), NumRetired(0), NumCycles(0), - HWStalls(HWStallEvent::LastGenericEvent), - // TODO: The view currently assumes a single register file. This will - // change in future. 
- RegisterFiles(1) {} + HWStalls(HWStallEvent::LastGenericEvent) { + initializeRegisterFileInfo(); + } void onInstructionEvent(const HWInstructionEvent &Event) override; Index: llvm/trunk/tools/llvm-mca/BackendStatistics.cpp =================================================================== --- llvm/trunk/tools/llvm-mca/BackendStatistics.cpp +++ llvm/trunk/tools/llvm-mca/BackendStatistics.cpp @@ -20,6 +20,26 @@ namespace mca { +void BackendStatistics::initializeRegisterFileInfo() { + const MCSchedModel &SM = STI.getSchedModel(); + RegisterFileUsage Empty = {0, 0, 0}; + if (!SM.hasExtraProcessorInfo()) { + // Assume a single register file. + RegisterFiles.emplace_back(Empty); + return; + } + + // Initialize a RegisterFileUsage for every user defined register file, plus + // the default register file which is always at index #0. + const MCExtraProcessorInfo &PI = SM.getExtraProcessorInfo(); + // There is always an "InvalidRegisterFile" entry in tablegen. That entry can + // be skipped. If there are no user defined register files, then reserve a + // single entry for the default register file at index #0. 
+ unsigned NumRegFiles = std::max(PI.NumRegisterFiles, 1U); + RegisterFiles.resize(NumRegFiles); + std::fill(RegisterFiles.begin(), RegisterFiles.end(), Empty); +} + void BackendStatistics::onInstructionEvent(const HWInstructionEvent &Event) { switch (Event.Type) { default: @@ -134,13 +154,36 @@ std::string Buffer; raw_string_ostream TempStream(Buffer); - TempStream << "\n\nRegister File statistics."; - for (unsigned I = 0, E = RegisterFiles.size(); I < E; ++I) { + TempStream << "\n\nRegister File statistics:"; + const RegisterFileUsage &GlobalUsage = RegisterFiles[0]; + TempStream << "\nTotal number of mappings created: " + << GlobalUsage.TotalMappings; + TempStream << "\nMax number of mappings used: " + << GlobalUsage.MaxUsedMappings << '\n'; + + for (unsigned I = 1, E = RegisterFiles.size(); I < E; ++I) { const RegisterFileUsage &RFU = RegisterFiles[I]; - TempStream << "\nRegister File #" << I; - TempStream << "\n Total number of mappings created: " << RFU.TotalMappings; - TempStream << "\n Max number of mappings used: " - << RFU.MaxUsedMappings; + // Obtain the register file descriptor from the scheduling model. + assert(STI.getSchedModel().hasExtraProcessorInfo() && + "Unable to find register file info!"); + const MCExtraProcessorInfo &PI = + STI.getSchedModel().getExtraProcessorInfo(); + // PI.RegisterFiles holds exactly NumRegisterFiles descriptors, so the + // index used below must be strictly smaller than that count. + assert(I < PI.NumRegisterFiles && "Unexpected register file index!"); + const MCRegisterFileDesc &RFDesc = PI.RegisterFiles[I]; + // Skip invalid register files. 
+ if (!RFDesc.NumPhysRegs) + continue; + + TempStream << "\n* Register File #" << I; + TempStream << " -- " << StringRef(RFDesc.Name) << ':'; + TempStream << "\n Number of physical registers: "; + if (!RFDesc.NumPhysRegs) + TempStream << "unbounded"; + else + TempStream << RFDesc.NumPhysRegs; + TempStream << "\n Total number of mappings created: " << RFU.TotalMappings; + TempStream << "\n Max number of mappings used: " + << RFU.MaxUsedMappings << '\n'; } TempStream.flush(); Index: llvm/trunk/tools/llvm-mca/Dispatch.h =================================================================== --- llvm/trunk/tools/llvm-mca/Dispatch.h +++ llvm/trunk/tools/llvm-mca/Dispatch.h @@ -51,27 +51,29 @@ // This is where information related to the various register files is kept. // This set always contains at least one register file at index #0. That // register file "sees" all the physical registers declared by the target, and - // (by default) it allows an unbound number of mappings. + // (by default) it allows an unbounded number of mappings. // Users can limit the number of mappings that can be created by register file // #0 through the command line flag `-register-file-size`. llvm::SmallVector RegisterFiles; + // This pair is used to identify the owner of a physical register, as well as + // the cost of using that register file. + using IndexPlusCostPairTy = std::pair; + // RegisterMapping objects are mainly used to track physical register // definitions. A WriteState object describes a register definition, and it is // used to track RAW dependencies (see Instruction.h). A RegisterMapping // object also specifies the set of register files. The mapping between // physreg and register files is done using a "register file mask". // - // A register file mask identifies a set of register files. Each bit of the - // mask representation references a specific register file. 
- // For example: - // 0b0001 --> Register file #0 - // 0b0010 --> Register file #1 - // 0b0100 --> Register file #2 + // A register file index identifies a user defined register file. + // There is one index per RegisterMappingTracker, and index #0 is reserved to + // the default unified register file. // - // Note that this implementation allows register files to overlap. - // The maximum number of register files allowed by this implementation is 32. - using RegisterMapping = std::pair; + // This implementation does not allow overlapping register files. The only + // register file that is allowed to overlap with other register files is + // register file #0. + using RegisterMapping = std::pair; // This map contains one entry for each physical register defined by the // processor scheduling model. @@ -95,24 +97,32 @@ // The list of register classes is then converted by the tablegen backend into // a list of register class indices. That list, along with the number of // available mappings, is then used to create a new RegisterMappingTracker. - void addRegisterFile(llvm::ArrayRef RegisterClasses, - unsigned NumTemps); - - // Allocates a new register mapping in every register file specified by the - // register file mask. This method is called from addRegisterMapping. - void createNewMappings(unsigned RegisterFileMask, + void + addRegisterFile(llvm::ArrayRef RegisterClasses, + unsigned NumPhysRegs); + + // Allocates register mappings in register file specified by the + // IndexPlusCostPairTy object. This method is called from addRegisterMapping. + void createNewMappings(IndexPlusCostPairTy IPC, llvm::MutableArrayRef UsedPhysRegs); - // Removes a previously allocated mapping from each register file in the - // RegisterFileMask set. This method is called from invalidateRegisterMapping. - void removeMappings(unsigned RegisterFileMask, + // Removes a previously allocated mapping from the register file referenced + // by the IndexPlusCostPairTy object. 
This method is called from + // invalidateRegisterMapping. + void removeMappings(IndexPlusCostPairTy IPC, llvm::MutableArrayRef FreedPhysRegs); + // Create an instance of RegisterMappingTracker for every register file + // specified by the processor model. + // If no register file is specified, then this method creates a single + // register file with an unbounded number of registers. + void initialize(const llvm::MCSchedModel &SM, unsigned NumRegs); + public: - RegisterFile(const llvm::MCRegisterInfo &mri, unsigned TempRegs = 0) - : MRI(mri), RegisterMappings(MRI.getNumRegs(), {nullptr, 0U}) { - addRegisterFile({}, TempRegs); - // TODO: teach the scheduling models how to specify multiple register files. + RegisterFile(const llvm::MCSchedModel &SM, const llvm::MCRegisterInfo &mri, + unsigned NumRegs = 0) + : MRI(mri), RegisterMappings(mri.getNumRegs(), {nullptr, {0, 0}}) { + initialize(SM, NumRegs); } // Creates a new register mapping for RegID. @@ -245,7 +255,7 @@ std::unique_ptr RCU; Backend *Owner; - bool checkRAT(unsigned Index, const Instruction &Desc); + bool checkRAT(unsigned Index, const Instruction &Inst); bool checkRCU(unsigned Index, const InstrDesc &Desc); bool checkScheduler(unsigned Index, const InstrDesc &Desc); @@ -254,13 +264,14 @@ llvm::ArrayRef UsedPhysRegs); public: - DispatchUnit(Backend *B, const llvm::MCRegisterInfo &MRI, - unsigned MicroOpBufferSize, unsigned RegisterFileSize, - unsigned MaxRetirePerCycle, unsigned MaxDispatchWidth, - Scheduler *Sched) + DispatchUnit(Backend *B, const llvm::MCSubtargetInfo &STI, + const llvm::MCRegisterInfo &MRI, unsigned MicroOpBufferSize, + unsigned RegisterFileSize, unsigned MaxRetirePerCycle, + unsigned MaxDispatchWidth, Scheduler *Sched) : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth), CarryOver(0U), SC(Sched), - RAT(llvm::make_unique(MRI, RegisterFileSize)), + RAT(llvm::make_unique(STI.getSchedModel(), MRI, + RegisterFileSize)), RCU(llvm::make_unique(MicroOpBufferSize, 
MaxRetirePerCycle, this)), Owner(B) {} Index: llvm/trunk/tools/llvm-mca/Dispatch.cpp =================================================================== --- llvm/trunk/tools/llvm-mca/Dispatch.cpp +++ llvm/trunk/tools/llvm-mca/Dispatch.cpp @@ -25,53 +25,99 @@ namespace mca { -void RegisterFile::addRegisterFile(ArrayRef RegisterClasses, - unsigned NumTemps) { +void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) { + // Create a default register file that "sees" all the machine registers + // declared by the target. The number of physical registers in the default + // register file is set equal to `NumRegs`. A value of zero for `NumRegs` + // means: this register file has an unbounded number of physical registers. + addRegisterFile({} /* all registers */, NumRegs); + if (!SM.hasExtraProcessorInfo()) + return; + + // For each user defined register file, allocate a RegisterMappingTracker + // object. The size of every register file, as well as the mapping between + // register files and register classes is specified via tablegen. + const MCExtraProcessorInfo &Info = SM.getExtraProcessorInfo(); + for (unsigned I = 0, E = Info.NumRegisterFiles; I < E; ++I) { + const MCRegisterFileDesc &RF = Info.RegisterFiles[I]; + // Skip invalid register files with zero physical registers. + unsigned Length = RF.NumRegisterCostEntries; + if (!RF.NumPhysRegs) + continue; + // The cost of a register definition is equivalent to the number of + // physical registers that are allocated at register renaming stage. + const MCRegisterCostEntry *FirstElt = + &Info.RegisterCostTable[RF.RegisterCostEntryIdx]; + addRegisterFile(ArrayRef(FirstElt, Length), + RF.NumPhysRegs); + } +} + +void RegisterFile::addRegisterFile(ArrayRef Entries, + unsigned NumPhysRegs) { + // A default register file is always allocated at index #0. That register file + // is mainly used to count the total number of mappings created by all + // register files at runtime. 
Users can limit the number of available physical + // registers in register file #0 through the command line flag + // `-register-file-size`. unsigned RegisterFileIndex = RegisterFiles.size(); - assert(RegisterFileIndex < 32 && "Too many register files!"); - RegisterFiles.emplace_back(NumTemps); + RegisterFiles.emplace_back(NumPhysRegs); - // Special case where there are no register classes specified. - // An empty register class set means *all* registers. - if (RegisterClasses.empty()) { - for (std::pair &Mapping : RegisterMappings) - Mapping.second |= 1U << RegisterFileIndex; - } else { - for (const unsigned RegClassIndex : RegisterClasses) { - const MCRegisterClass &RC = MRI.getRegClass(RegClassIndex); - for (const MCPhysReg Reg : RC) - RegisterMappings[Reg].second |= 1U << RegisterFileIndex; + // Special case where there is no register class identifier in the set. + // An empty set of register classes means: this register file contains all + // the physical registers specified by the target. + if (Entries.empty()) { + for (std::pair &Mapping : RegisterMappings) + Mapping.second = std::make_pair(RegisterFileIndex, 1U); + return; + } + + // Now update the cost of individual registers. + for (const MCRegisterCostEntry &RCE : Entries) { + const MCRegisterClass &RC = MRI.getRegClass(RCE.RegisterClassID); + for (const MCPhysReg Reg : RC) { + IndexPlusCostPairTy &Entry = RegisterMappings[Reg].second; + if (Entry.first) { + // The only register file that is allowed to overlap is the default + // register file at index #0. The analysis is inaccurate if register + // files overlap. 
+ errs() << "warning: register " << MRI.getName(Reg) + << " defined in multiple register files.\n"; + } + Entry.first = RegisterFileIndex; + Entry.second = RCE.Cost; } } } -void RegisterFile::createNewMappings(unsigned RegisterFileMask, +void RegisterFile::createNewMappings(IndexPlusCostPairTy Entry, MutableArrayRef UsedPhysRegs) { - assert(RegisterFileMask && "RegisterFileMask cannot be zero!"); - // Notify each register file that contains RegID. - do { - unsigned NextRegisterFile = llvm::PowerOf2Floor(RegisterFileMask); - unsigned RegisterFileIndex = llvm::countTrailingZeros(NextRegisterFile); + unsigned RegisterFileIndex = Entry.first; + unsigned Cost = Entry.second; + if (RegisterFileIndex) { RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex]; - RMT.NumUsedMappings++; - UsedPhysRegs[RegisterFileIndex]++; - RegisterFileMask ^= NextRegisterFile; - } while (RegisterFileMask); + RMT.NumUsedMappings += Cost; + UsedPhysRegs[RegisterFileIndex] += Cost; + } + + // Now update the default register mapping tracker. + RegisterFiles[0].NumUsedMappings += Cost; + UsedPhysRegs[0] += Cost; } -void RegisterFile::removeMappings(unsigned RegisterFileMask, +void RegisterFile::removeMappings(IndexPlusCostPairTy Entry, MutableArrayRef FreedPhysRegs) { - assert(RegisterFileMask && "RegisterFileMask cannot be zero!"); - // Notify each register file that contains RegID. - do { - unsigned NextRegisterFile = llvm::PowerOf2Floor(RegisterFileMask); - unsigned RegisterFileIndex = llvm::countTrailingZeros(NextRegisterFile); + unsigned RegisterFileIndex = Entry.first; + unsigned Cost = Entry.second; + if (RegisterFileIndex) { RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex]; - assert(RMT.NumUsedMappings); - RMT.NumUsedMappings--; - FreedPhysRegs[RegisterFileIndex]++; - RegisterFileMask ^= NextRegisterFile; - } while (RegisterFileMask); + // Releasing more mappings than are currently in use would wrap the + // counter around; catch that in asserts-enabled builds. + assert(RMT.NumUsedMappings >= Cost && "Register mapping underflow!"); + RMT.NumUsedMappings -= Cost; + FreedPhysRegs[RegisterFileIndex] += Cost; + } + + // Now update the default register mapping tracker. 
+ RegisterFiles[0].NumUsedMappings -= Cost; + FreedPhysRegs[0] += Cost; } void RegisterFile::addRegisterMapping(WriteState &WS, @@ -145,32 +191,30 @@ } unsigned RegisterFile::isAvailable(ArrayRef Regs) const { - SmallVector NumTemporaries(getNumRegisterFiles()); + SmallVector NumPhysRegs(getNumRegisterFiles()); // Find how many new mappings must be created for each register file. for (const unsigned RegID : Regs) { - unsigned RegisterFileMask = RegisterMappings[RegID].second; - do { - unsigned NextRegisterFileID = llvm::PowerOf2Floor(RegisterFileMask); - NumTemporaries[llvm::countTrailingZeros(NextRegisterFileID)]++; - RegisterFileMask ^= NextRegisterFileID; - } while (RegisterFileMask); + const IndexPlusCostPairTy &Entry = RegisterMappings[RegID].second; + if (Entry.first) + NumPhysRegs[Entry.first] += Entry.second; + NumPhysRegs[0] += Entry.second; } unsigned Response = 0; for (unsigned I = 0, E = getNumRegisterFiles(); I < E; ++I) { - unsigned Temporaries = NumTemporaries[I]; - if (!Temporaries) + unsigned NumRegs = NumPhysRegs[I]; + if (!NumRegs) continue; const RegisterMappingTracker &RMT = RegisterFiles[I]; if (!RMT.TotalMappings) { - // The register file has an unbound number of microarchitectural + // The register file has an unbounded number of microarchitectural // registers. continue; } - if (RMT.TotalMappings < Temporaries) { + if (RMT.TotalMappings < NumRegs) { // The current register file is too small. This may occur if the number of // microarchitectural registers in register file #0 was changed by the // users via flag -reg-file-size. 
Alternatively, the scheduling model @@ -179,7 +223,7 @@ "Not enough microarchitectural registers in the register file"); } - if (RMT.TotalMappings < RMT.NumUsedMappings + Temporaries) + if (RMT.TotalMappings < (RMT.NumUsedMappings + NumRegs)) Response |= (1U << I); } @@ -190,7 +234,8 @@ void RegisterFile::dump() const { for (unsigned I = 0, E = MRI.getNumRegs(); I < E; ++I) { const RegisterMapping &RM = RegisterMappings[I]; - dbgs() << MRI.getName(I) << ", " << I << ", Map=" << RM.second << ", "; + dbgs() << MRI.getName(I) << ", " << I << ", Map=" << RM.second.first + << ", "; if (RM.first) RM.first->dump(); else Index: llvm/trunk/utils/TableGen/CodeGenSchedule.h =================================================================== --- llvm/trunk/utils/TableGen/CodeGenSchedule.h +++ llvm/trunk/utils/TableGen/CodeGenSchedule.h @@ -26,6 +26,7 @@ class CodeGenTarget; class CodeGenSchedModels; class CodeGenInstruction; +class CodeGenRegisterClass; using RecVec = std::vector; using RecIter = std::vector::const_iterator; @@ -157,6 +158,38 @@ #endif }; +/// Represent the cost of allocating a register of register class RCDef. +/// +/// The cost of allocating a register is equivalent to the number of physical +/// registers used by the register renamer. Register costs are defined at +/// register class granularity. +struct CodeGenRegisterCost { + Record *RCDef; + unsigned Cost; + CodeGenRegisterCost(Record *RC, unsigned RegisterCost) + : RCDef(RC), Cost(RegisterCost) {} + CodeGenRegisterCost(const CodeGenRegisterCost &) = default; + CodeGenRegisterCost &operator=(const CodeGenRegisterCost &) = delete; +}; + +/// A processor register file. +/// +/// This class describes a processor register file. Register file information is +/// currently consumed by external tools like llvm-mca to predict dispatch +/// stalls due to register pressure. 
+struct CodeGenRegisterFile { + std::string Name; + Record *RegisterFileDef; + + unsigned NumPhysRegs; + std::vector Costs; + + CodeGenRegisterFile(StringRef name, Record *def) + : Name(name), RegisterFileDef(def), NumPhysRegs(0) {} + + bool hasDefaultCosts() const { return Costs.empty(); } +}; + // Processor model. // // ModelName is a unique name used to name an instantiation of MCSchedModel. @@ -199,6 +232,9 @@ // Per-operand machine model resources associated with this processor. RecVec ProcResourceDefs; + // List of Register Files. + std::vector RegisterFiles; + CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef, Record *IDef) : Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef) {} @@ -211,6 +247,10 @@ return !WriteResDefs.empty() || !ItinRWDefs.empty(); } + bool hasExtraProcessorInfo() const { + return !RegisterFiles.empty(); + } + unsigned getProcResourceIdx(Record *PRDef) const; bool isUnsupported(const CodeGenInstruction &Inst) const; @@ -396,6 +436,8 @@ void collectSchedClasses(); + void collectRegisterFiles(); + std::string createSchedClassName(Record *ItinClassDef, ArrayRef OperWrites, ArrayRef OperReads); Index: llvm/trunk/utils/TableGen/CodeGenSchedule.cpp =================================================================== --- llvm/trunk/utils/TableGen/CodeGenSchedule.cpp +++ llvm/trunk/utils/TableGen/CodeGenSchedule.cpp @@ -211,6 +211,9 @@ DEBUG(dbgs() << "\n+++ RESOURCE DEFINITIONS (collectProcResources) +++\n"); collectProcResources(); + // Find register file definitions for each processor. + collectRegisterFiles(); + checkCompleteness(); } @@ -1486,6 +1489,30 @@ } } +// Collect all the RegisterFile definitions available in this target. +void CodeGenSchedModels::collectRegisterFiles() { + RecVec RegisterFileDefs = Records.getAllDerivedDefinitions("RegisterFile"); + + // RegisterFiles is the vector of CodeGenRegisterFile. 
+ for (Record *RF : RegisterFileDefs) { + // For each register file definition, construct a CodeGenRegisterFile object + // and add it to the appropriate scheduling model. + CodeGenProcModel &PM = getProcModel(RF->getValueAsDef("SchedModel")); + PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(),RF)); + CodeGenRegisterFile &CGRF = PM.RegisterFiles.back(); + + // Now set the number of physical registers as well as the cost of registers + // in each register class. + CGRF.NumPhysRegs = RF->getValueAsInt("NumPhysRegs"); + RecVec RegisterClasses = RF->getValueAsListOfDefs("RegClasses"); + std::vector RegisterCosts = RF->getValueAsListOfInts("RegCosts"); + for (unsigned I = 0, E = RegisterClasses.size(); I < E; ++I) { + int Cost = RegisterCosts.size() > I ? RegisterCosts[I] : 1; + CGRF.Costs.emplace_back(RegisterClasses[I], Cost); + } + } +} + // Collect and sort WriteRes, ReadAdvance, and ProcResources. void CodeGenSchedModels::collectProcResources() { ProcResourceDefs = Records.getAllDerivedDefinitions("ProcResourceUnits"); Index: llvm/trunk/utils/TableGen/SubtargetEmitter.cpp =================================================================== --- llvm/trunk/utils/TableGen/SubtargetEmitter.cpp +++ llvm/trunk/utils/TableGen/SubtargetEmitter.cpp @@ -90,6 +90,7 @@ void EmitItineraries(raw_ostream &OS, std::vector> &ProcItinLists); + void EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel, raw_ostream &OS); void EmitProcessorProp(raw_ostream &OS, const Record *R, StringRef Name, char Separator); void EmitProcessorResourceSubUnits(const CodeGenProcModel &ProcModel, @@ -604,6 +605,61 @@ OS << "};\n"; } +void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel, + raw_ostream &OS) { + if (llvm::all_of(ProcModel.RegisterFiles, [](const CodeGenRegisterFile &RF) { + return RF.hasDefaultCosts(); + })) + return; + + // Print the RegisterCost table first. 
+ OS << "\n// {RegisterClassID, Register Cost}\n"; + OS << "static const llvm::MCRegisterCostEntry " << ProcModel.ModelName + << "RegisterCosts" + << "[] = {\n"; + + for (const CodeGenRegisterFile &RF : ProcModel.RegisterFiles) { + // Skip register files with a default cost table. + if (RF.hasDefaultCosts()) + continue; + // Add entries to the cost table. + for (const CodeGenRegisterCost &RC : RF.Costs) { + OS << " { "; + Record *Rec = RC.RCDef; + if (Rec->getValue("Namespace")) + OS << Rec->getValueAsString("Namespace") << "::"; + OS << Rec->getName() << "RegClassID, " << RC.Cost << "},\n"; + } + } + OS << "};\n"; + + // Now generate a table with register file info. + OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl}\n"; + OS << "static const llvm::MCRegisterFileDesc " << ProcModel.ModelName + << "RegisterFiles" + << "[] = {\n" + << " { \"InvalidRegisterFile\", 0, 0, 0 },\n"; + unsigned CostTblIndex = 0; + + for (const CodeGenRegisterFile &RD : ProcModel.RegisterFiles) { + OS << " { "; + OS << '"' << RD.Name << '"' << ", " << RD.NumPhysRegs << ", "; + unsigned NumCostEntries = RD.Costs.size(); + OS << NumCostEntries << ", " << CostTblIndex << "},\n"; + CostTblIndex += NumCostEntries; + } + OS << "};\n"; + + // Now generate a table for the extra processor info. + OS << "\nstatic const llvm::MCExtraProcessorInfo " << ProcModel.ModelName + << "ExtraInfo = {\n " << ProcModel.ModelName << "RegisterFiles,\n " + << (1 + ProcModel.RegisterFiles.size()) + << ", // Number of register files.\n " + << ProcModel.ModelName << "RegisterCosts,\n " << CostTblIndex + << " // Number of register cost entries.\n" + << "};\n"; +} + void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, raw_ostream &OS) { EmitProcessorResourceSubUnits(ProcModel, OS); @@ -1157,6 +1213,9 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) { // For each processor model. 
for (const CodeGenProcModel &PM : SchedModels.procModels()) { + // Emit extra processor info if available. + if (PM.hasExtraProcessorInfo()) + EmitExtraProcessorInfo(PM, OS); // Emit processor resource table. if (PM.hasInstrSchedModel()) EmitProcessorResources(PM, OS); @@ -1197,9 +1256,13 @@ OS << " nullptr, nullptr, 0, 0," << " // No instruction-level machine model.\n"; if (PM.hasItineraries()) - OS << " " << PM.ItinsDef->getName() << "\n"; + OS << " " << PM.ItinsDef->getName() << ",\n"; + else + OS << " nullptr, // No Itinerary\n"; + if (PM.hasExtraProcessorInfo()) + OS << " &" << PM.ModelName << "ExtraInfo\n"; else - OS << " nullptr // No Itinerary\n"; + OS << " nullptr // No extra processor descriptor\n"; OS << "};\n"; } }