diff --git a/llvm/docs/CommandGuide/llvm-mca.rst b/llvm/docs/CommandGuide/llvm-mca.rst --- a/llvm/docs/CommandGuide/llvm-mca.rst +++ b/llvm/docs/CommandGuide/llvm-mca.rst @@ -16,8 +16,8 @@ of machine code in a specific CPU. Performance is measured in terms of throughput as well as processor resource -consumption. The tool currently works for processors with an out-of-order -backend, for which there is a scheduling model available in LLVM. +consumption. The tool currently works for processors with a backend for which +there is a scheduling model available in LLVM. The main goal of this tool is not just to predict the performance of the code when run on the target, but also help with diagnosing potential performance @@ -204,7 +204,8 @@ Print information about bottlenecks that affect the throughput. This analysis can be expensive, and it is disabled by default. Bottlenecks are highlighted - in the summary view. + in the summary view. Bottleneck analysis is currently not supported for + processors with an in-order backend. .. option:: -json @@ -388,7 +389,9 @@ Throughput). Field *DispatchWidth* is the maximum number of micro opcodes that are dispatched -to the out-of-order backend every simulated cycle. +to the out-of-order backend every simulated cycle. For processors with an +in-order backend, *DispatchWidth* is the maximum number of micro opcodes issued +to the backend every simulated cycle. IPC is computed dividing the total number of simulated instructions by the total number of cycles. @@ -653,6 +656,8 @@ dependent on the simulation and (as always) by the quality of the processor model in llvm. +Bottleneck analysis is currently not supported for processors with an in-order +backend. Extra Statistics to Further Diagnose Performance Issues ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -797,11 +802,14 @@ * Write Back (Instruction is executed, and results are written back). * Retire (Instruction is retired; writes are architecturally committed). 
-The default pipeline only models the out-of-order portion of a processor. -Therefore, the instruction fetch and decode stages are not modeled. Performance -bottlenecks in the frontend are not diagnosed. :program:`llvm-mca` assumes that -instructions have all been decoded and placed into a queue before the simulation -start. Also, :program:`llvm-mca` does not model branch prediction. +The in-order pipeline implements the following sequence of stages: +* InOrderIssue (Instruction is issued to the processor pipelines). +* Retire (Instruction is retired; writes are architecturally committed). + +:program:`llvm-mca` assumes that instructions have all been decoded and placed +into a queue before the simulation starts. Therefore, the instruction fetch and +decode stages are not modeled. Performance bottlenecks in the frontend are not +diagnosed. Also, :program:`llvm-mca` does not model branch prediction. Instruction Dispatch """""""""""""""""""" @@ -957,3 +965,17 @@ #. A load may pass a previous load. #. A load may not pass a previous store unless ``-noalias`` is set. #. A load has to wait until an older load barrier is fully executed. + +In-order Issue and Execute +"""""""""""""""""""""""""""""""""""" +In-order processors are modelled as a single ``InOrderIssueStage`` stage. It +bypasses Dispatch, Scheduler and the Load/Store unit. Instructions are issued as +soon as their operand registers are available and resource requirements are +met. Multiple instructions can be issued in one cycle according to the value of +the ``IssueWidth`` parameter in LLVM's scheduling model. + +Once issued, an instruction is moved to the ``IssuedInst`` set until it is ready +to retire. If a ``RetireControlUnit`` is defined in LLVM's scheduling model, +:program:`llvm-mca` ensures that instructions are retired in-order. However, an +instruction is allowed to retire out-of-order if the ``RetireOOO`` property is +true for at least one of its writes. 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -130,6 +130,9 @@ * The options ``--build-id-link-{dir,input,output}`` have been deleted. (`D96310 `_) +* Support for in-order processors has been added to ``llvm-mca``. + (`D94928 `_) + Changes to LLDB --------------------------------- diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h --- a/llvm/include/llvm/MC/MCSchedule.h +++ b/llvm/include/llvm/MC/MCSchedule.h @@ -108,15 +108,16 @@ /// /// Defined as an aggregate struct for creating tables with initializer lists. struct MCSchedClassDesc { - static const unsigned short InvalidNumMicroOps = (1U << 14) - 1; + static const unsigned short InvalidNumMicroOps = (1U << 13) - 1; static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) const char* Name; #endif - uint16_t NumMicroOps : 14; + uint16_t NumMicroOps : 13; uint16_t BeginGroup : 1; uint16_t EndGroup : 1; + uint16_t RetireOOO : 1; uint16_t WriteProcResIdx; // First index into WriteProcResTable. uint16_t NumWriteProcResEntries; uint16_t WriteLatencyIdx; // First index into WriteLatencyTable. diff --git a/llvm/include/llvm/MCA/Context.h b/llvm/include/llvm/MCA/Context.h --- a/llvm/include/llvm/MCA/Context.h +++ b/llvm/include/llvm/MCA/Context.h @@ -68,6 +68,11 @@ /// This pipeline consists of Fetch, Dispatch, Execute, and Retire stages. std::unique_ptr createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr); + + /// Construct a basic pipeline for simulating an in-order pipeline. + /// This pipeline consists of Fetch, InOrderIssue, and Retire stages. 
+ std::unique_ptr createInOrderPipeline(const PipelineOptions &Opts, + SourceMgr &SrcMgr); }; } // namespace mca diff --git a/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h b/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h --- a/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h +++ b/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h @@ -172,11 +172,6 @@ void freePhysRegs(const RegisterRenamingInfo &Entry, MutableArrayRef FreedPhysRegs); - // Collects writes that are in a RAW dependency with RS. - // This method is called from `addRegisterRead()`. - void collectWrites(const ReadState &RS, - SmallVectorImpl &Writes) const; - // Create an instance of RegisterMappingTracker for every register file // specified by the processor model. // If no register file is specified, then this method creates a default @@ -187,6 +182,10 @@ RegisterFile(const MCSchedModel &SM, const MCRegisterInfo &mri, unsigned NumRegs = 0); + // Collects writes that are in a RAW dependency with RS. + void collectWrites(const ReadState &RS, + SmallVectorImpl &Writes) const; + // This method updates the register mappings inserting a new register // definition. This method is also responsible for updating the number of // allocated physical registers in each register file modified by the write. diff --git a/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h b/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h --- a/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h +++ b/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h @@ -104,6 +104,9 @@ #ifndef NDEBUG void dump() const; #endif + + // Assigned to instructions that are not handled by the RCU. 
+ static const unsigned UnhandledTokenID = ~0U; }; } // namespace mca diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h --- a/llvm/include/llvm/MCA/Instruction.h +++ b/llvm/include/llvm/MCA/Instruction.h @@ -375,6 +375,7 @@ bool HasSideEffects; bool BeginGroup; bool EndGroup; + bool RetireOOO; // True if all buffered resources are in-order, and there is at least one // buffer which is a dispatch hazard (BufferSize = 0). diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h @@ -0,0 +1,84 @@ +//===---------------------- InOrderIssueStage.h -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// InOrderIssueStage implements an in-order execution pipeline. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MCA_IN_ORDER_ISSUE_STAGE_H +#define LLVM_MCA_IN_ORDER_ISSUE_STAGE_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/MCA/SourceMgr.h" +#include "llvm/MCA/Stages/Stage.h" + +#include + +namespace llvm { +struct MCSchedModel; +class MCSubtargetInfo; + +namespace mca { +class RegisterFile; +class ResourceManager; +struct RetireControlUnit; + +class InOrderIssueStage final : public Stage { + const MCSchedModel &SM; + const MCSubtargetInfo &STI; + RetireControlUnit &RCU; + RegisterFile &PRF; + std::unique_ptr RM; + + /// Instructions that were issued, but not executed yet. + SmallVector IssuedInst; + + /// Number of instructions issued in the current cycle. 
+ unsigned NumIssued; + + /// If an instruction cannot execute due to an unmet register or resource + /// dependency, then it is stalled for StallCyclesLeft. + InstRef StalledInst; + unsigned StallCyclesLeft; + + /// Remaining issue bandwidth for the current cycle, counted in micro-ops. + unsigned Bandwidth; + + InOrderIssueStage(const InOrderIssueStage &Other) = delete; + InOrderIssueStage &operator=(const InOrderIssueStage &Other) = delete; + + /// If IR has an unmet register or resource dependency, canExecute returns + /// false. StallCycles is set to the number of cycles left before the + /// instruction can be issued. + bool canExecute(const InstRef &IR, unsigned *StallCycles) const; + + /// Issue the instruction, or update StallCycles if IR is stalled. + Error tryIssue(InstRef &IR, unsigned *StallCycles); + + /// Update status of instructions from IssuedInst. + Error updateIssuedInst(); + +public: + InOrderIssueStage(RetireControlUnit &RCU, RegisterFile &PRF, + const MCSchedModel &SM, const MCSubtargetInfo &STI) + : SM(SM), STI(STI), RCU(RCU), PRF(PRF), + RM(std::make_unique(SM)), NumIssued(0), + StallCyclesLeft(0), Bandwidth(0) {} + + bool isAvailable(const InstRef &) const override; + bool hasWorkToComplete() const override; + Error execute(InstRef &IR) override; + Error cycleStart() override; + Error cycleEnd() override; +}; + +} // namespace mca +} // namespace llvm + +#endif // LLVM_MCA_IN_ORDER_ISSUE_STAGE_H diff --git a/llvm/include/llvm/MCA/Stages/RetireStage.h b/llvm/include/llvm/MCA/Stages/RetireStage.h --- a/llvm/include/llvm/MCA/Stages/RetireStage.h +++ b/llvm/include/llvm/MCA/Stages/RetireStage.h @@ -16,6 +16,7 @@ #ifndef LLVM_MCA_STAGES_RETIRESTAGE_H #define LLVM_MCA_STAGES_RETIRESTAGE_H +#include "llvm/ADT/SmallVector.h" #include "llvm/MCA/HardwareUnits/LSUnit.h" #include "llvm/MCA/HardwareUnits/RegisterFile.h" #include "llvm/MCA/HardwareUnits/RetireControlUnit.h" @@ -29,6 +30,7 @@ RetireControlUnit &RCU; RegisterFile &PRF; LSUnitBase &LSU; + SmallVector RetireInst; 
RetireStage(const RetireStage &Other) = delete; RetireStage &operator=(const RetireStage &Other) = delete; @@ -37,7 +39,9 @@ RetireStage(RetireControlUnit &R, RegisterFile &F, LSUnitBase &LS) : Stage(), RCU(R), PRF(F), LSU(LS) {} - bool hasWorkToComplete() const override { return !RCU.isEmpty(); } + bool hasWorkToComplete() const override { + return !RCU.isEmpty() || !RetireInst.empty(); + } Error cycleStart() override; Error execute(InstRef &IR) override; void notifyInstructionRetired(const InstRef &IR) const; diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td --- a/llvm/include/llvm/Target/TargetSchedule.td +++ b/llvm/include/llvm/Target/TargetSchedule.td @@ -262,6 +262,10 @@ // Allow a processor to mark some scheduling classes as single-issue. // SingleIssue is an alias for Begin/End Group. bit SingleIssue = false; + // An instruction is allowed to retire out-of-order if RetireOOO is + // true for at least one of its writes. This field is only used by + // MCA for in-order subtargets, and is ignored for other targets. 
+ bit RetireOOO = false; SchedMachineModel SchedModel = ?; } diff --git a/llvm/lib/MCA/CMakeLists.txt b/llvm/lib/MCA/CMakeLists.txt --- a/llvm/lib/MCA/CMakeLists.txt +++ b/llvm/lib/MCA/CMakeLists.txt @@ -14,6 +14,7 @@ Stages/DispatchStage.cpp Stages/EntryStage.cpp Stages/ExecuteStage.cpp + Stages/InOrderIssueStage.cpp Stages/InstructionTables.cpp Stages/MicroOpQueueStage.cpp Stages/RetireStage.cpp diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp --- a/llvm/lib/MCA/Context.cpp +++ b/llvm/lib/MCA/Context.cpp @@ -21,6 +21,7 @@ #include "llvm/MCA/Stages/DispatchStage.h" #include "llvm/MCA/Stages/EntryStage.h" #include "llvm/MCA/Stages/ExecuteStage.h" +#include "llvm/MCA/Stages/InOrderIssueStage.h" #include "llvm/MCA/Stages/MicroOpQueueStage.h" #include "llvm/MCA/Stages/RetireStage.h" @@ -31,6 +32,9 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) { const MCSchedModel &SM = STI.getSchedModel(); + if (!SM.isOutOfOrder()) + return createInOrderPipeline(Opts, SrcMgr); + // Create the hardware units defining the backend. 
auto RCU = std::make_unique(SM); auto PRF = std::make_unique(SM, MRI, Opts.RegisterFileSize); @@ -64,5 +68,29 @@ return StagePipeline; } +std::unique_ptr +Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) { + const MCSchedModel &SM = STI.getSchedModel(); + auto RCU = std::make_unique(SM); + auto PRF = std::make_unique(SM, MRI, Opts.RegisterFileSize); + auto LSU = std::make_unique(SM, Opts.LoadQueueSize, + Opts.StoreQueueSize, Opts.AssumeNoAlias); + + auto Entry = std::make_unique(SrcMgr); + auto InOrderIssue = std::make_unique(*RCU, *PRF, SM, STI); + auto Retire = std::make_unique(*RCU, *PRF, *LSU); + + auto StagePipeline = std::make_unique(); + StagePipeline->appendStage(std::move(Entry)); + StagePipeline->appendStage(std::move(InOrderIssue)); + StagePipeline->appendStage(std::move(Retire)); + + addHardwareUnit(std::move(RCU)); + addHardwareUnit(std::move(PRF)); + addHardwareUnit(std::move(LSU)); + + return StagePipeline; +} + } // namespace mca } // namespace llvm diff --git a/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp b/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp --- a/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp +++ b/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp @@ -33,12 +33,18 @@ MaxRetirePerCycle = EPI.MaxRetirePerCycle; } NumROBEntries = AvailableEntries; + bool IsOutOfOrder = SM.MicroOpBufferSize; + if (!IsOutOfOrder && !NumROBEntries) + return; assert(NumROBEntries && "Invalid reorder buffer size!"); Queue.resize(2 * NumROBEntries); } // Reserves a number of slots, and returns a new token. 
unsigned RetireControlUnit::dispatch(const InstRef &IR) { + if (!NumROBEntries) + return UnhandledTokenID; + const Instruction &Inst = *IR.getInstruction(); unsigned Entries = normalizeQuantity(Inst.getNumMicroOps()); assert((AvailableEntries >= Entries) && "Reorder Buffer unavailable!"); @@ -47,6 +53,7 @@ Queue[NextAvailableSlotIdx] = {IR, Entries, false}; NextAvailableSlotIdx += std::max(1U, Entries); NextAvailableSlotIdx %= Queue.size(); + assert(TokenID < UnhandledTokenID && "Invalid token ID"); AvailableEntries -= Entries; return TokenID; diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp --- a/llvm/lib/MCA/InstrBuilder.cpp +++ b/llvm/lib/MCA/InstrBuilder.cpp @@ -570,6 +570,7 @@ ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects(); ID->BeginGroup = SCDesc.BeginGroup; ID->EndGroup = SCDesc.EndGroup; + ID->RetireOOO = SCDesc.RetireOOO; initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks); computeMaxLatency(*ID, MCDesc, SCDesc, STI); diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp @@ -0,0 +1,292 @@ +//===---------------------- InOrderIssueStage.cpp ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// InOrderIssueStage implements an in-order execution pipeline. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/MCA/Stages/InOrderIssueStage.h" + +#include "llvm/MC/MCSchedule.h" +#include "llvm/MCA/HWEventListener.h" +#include "llvm/MCA/HardwareUnits/RegisterFile.h" +#include "llvm/MCA/HardwareUnits/ResourceManager.h" +#include "llvm/MCA/HardwareUnits/RetireControlUnit.h" +#include "llvm/MCA/Instruction.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" + +#include + +#define DEBUG_TYPE "llvm-mca" +namespace llvm { +namespace mca { + +bool InOrderIssueStage::hasWorkToComplete() const { + return !IssuedInst.empty() || StalledInst; +} + +bool InOrderIssueStage::isAvailable(const InstRef &IR) const { + const Instruction &Inst = *IR.getInstruction(); + unsigned NumMicroOps = Inst.getNumMicroOps(); + const InstrDesc &Desc = Inst.getDesc(); + + if (Bandwidth < NumMicroOps) + return false; + + // Instruction with BeginGroup must be the first instruction to be issued in a + // cycle. + if (Desc.BeginGroup && NumIssued != 0) + return false; + + return true; +} + +static bool hasResourceHazard(const ResourceManager &RM, const InstRef &IR) { + if (RM.checkAvailability(IR.getInstruction()->getDesc())) { + LLVM_DEBUG(dbgs() << "[E] Stall #" << IR << '\n'); + return true; + } + + return false; +} + +/// Return a number of cycles left until register requirements of the +/// instructions are met. 
+static unsigned checkRegisterHazard(const RegisterFile &PRF, + const MCSchedModel &SM, + const MCSubtargetInfo &STI, + const InstRef &IR) { + unsigned StallCycles = 0; + SmallVector Writes; + + for (const ReadState &RS : IR.getInstruction()->getUses()) { + const ReadDescriptor &RD = RS.getDescriptor(); + const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID); + + PRF.collectWrites(RS, Writes); + for (const WriteRef &WR : Writes) { + const WriteState *WS = WR.getWriteState(); + unsigned WriteResID = WS->getWriteResourceID(); + int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID); + LLVM_DEBUG(dbgs() << "[E] ReadAdvance for #" << IR << ": " << ReadAdvance + << '\n'); + + if (WS->getCyclesLeft() == UNKNOWN_CYCLES) { + // Try again in the next cycle until the value is known + StallCycles = std::max(StallCycles, 1U); + continue; + } + + int CyclesLeft = WS->getCyclesLeft() - ReadAdvance; + if (CyclesLeft > 0) { + LLVM_DEBUG(dbgs() << "[E] Register hazard: " << WS->getRegisterID() + << '\n'); + StallCycles = std::max(StallCycles, (unsigned)CyclesLeft); + } + } + Writes.clear(); + } + + return StallCycles; +} + +bool InOrderIssueStage::canExecute(const InstRef &IR, + unsigned *StallCycles) const { + *StallCycles = 0; + + if (unsigned RegStall = checkRegisterHazard(PRF, SM, STI, IR)) { + *StallCycles = RegStall; + // FIXME: add a parameter to HWStallEvent to indicate a number of cycles. 
+ for (unsigned I = 0; I < RegStall; ++I) { + notifyEvent( + HWStallEvent(HWStallEvent::RegisterFileStall, IR)); + notifyEvent( + HWPressureEvent(HWPressureEvent::REGISTER_DEPS, IR)); + } + } else if (hasResourceHazard(*RM, IR)) { + *StallCycles = 1; + notifyEvent( + HWStallEvent(HWStallEvent::DispatchGroupStall, IR)); + notifyEvent( + HWPressureEvent(HWPressureEvent::RESOURCES, IR)); + } + + return *StallCycles == 0; +} + +static void addRegisterReadWrite(RegisterFile &PRF, Instruction &IS, + unsigned SourceIndex, + const MCSubtargetInfo &STI, + SmallVectorImpl &UsedRegs) { + assert(!IS.isEliminated()); + + for (ReadState &RS : IS.getUses()) + PRF.addRegisterRead(RS, STI); + + for (WriteState &WS : IS.getDefs()) + PRF.addRegisterWrite(WriteRef(SourceIndex, &WS), UsedRegs); +} + +static void notifyInstructionExecute( + const InstRef &IR, + const SmallVectorImpl> &UsedRes, + const Stage &S) { + + S.notifyEvent( + HWInstructionEvent(HWInstructionEvent::Ready, IR)); + S.notifyEvent(HWInstructionIssuedEvent(IR, UsedRes)); + + LLVM_DEBUG(dbgs() << "[E] Issued #" << IR << "\n"); +} + +static void notifyInstructionDispatch(const InstRef &IR, unsigned Ops, + const SmallVectorImpl &UsedRegs, + const Stage &S) { + + S.notifyEvent( + HWInstructionDispatchedEvent(IR, UsedRegs, Ops)); + + LLVM_DEBUG(dbgs() << "[E] Dispatched #" << IR << "\n"); +} + +llvm::Error InOrderIssueStage::execute(InstRef &IR) { + Instruction &IS = *IR.getInstruction(); + const InstrDesc &Desc = IS.getDesc(); + + unsigned RCUTokenID = RetireControlUnit::UnhandledTokenID; + if (!Desc.RetireOOO) + RCUTokenID = RCU.dispatch(IR); + IS.dispatch(RCUTokenID); + + if (Desc.EndGroup) { + Bandwidth = 0; + } else { + unsigned NumMicroOps = IR.getInstruction()->getNumMicroOps(); + assert(Bandwidth >= NumMicroOps); + Bandwidth -= NumMicroOps; + } + + if (llvm::Error E = tryIssue(IR, &StallCyclesLeft)) + return E; + + if (StallCyclesLeft) { + StalledInst = IR; + Bandwidth = 0; + } + + return llvm::ErrorSuccess(); +} + 
+llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) { + Instruction &IS = *IR.getInstruction(); + unsigned SourceIndex = IR.getSourceIndex(); + + if (!canExecute(IR, StallCycles)) { + LLVM_DEBUG(dbgs() << "[E] Stalled #" << IR << " for " << *StallCycles + << " cycles\n"); + return llvm::ErrorSuccess(); + } + + SmallVector UsedRegs(PRF.getNumRegisterFiles()); + addRegisterReadWrite(PRF, IS, SourceIndex, STI, UsedRegs); + + notifyInstructionDispatch(IR, IS.getDesc().NumMicroOps, UsedRegs, *this); + + SmallVector, 4> UsedResources; + RM->issueInstruction(IS.getDesc(), UsedResources); + IS.execute(SourceIndex); + + // Replace resource masks with valid resource processor IDs. + for (std::pair &Use : UsedResources) { + uint64_t Mask = Use.first.first; + Use.first.first = RM->resolveResourceMask(Mask); + } + notifyInstructionExecute(IR, UsedResources, *this); + + IssuedInst.push_back(IR); + ++NumIssued; + + return llvm::ErrorSuccess(); +} + +llvm::Error InOrderIssueStage::updateIssuedInst() { + // Update other instructions. Executed instructions will be retired during the + // next cycle. 
+ unsigned NumExecuted = 0; + for (auto I = IssuedInst.begin(), E = IssuedInst.end(); + I != (E - NumExecuted);) { + InstRef &IR = *I; + Instruction &IS = *IR.getInstruction(); + + IS.cycleEvent(); + if (!IS.isExecuted()) { + LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR + << " is still executing\n"); + ++I; + continue; + } + notifyEvent( + HWInstructionEvent(HWInstructionEvent::Executed, IR)); + + LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR << " is executed\n"); + ++NumExecuted; + std::iter_swap(I, E - NumExecuted); + } + + // Retire instructions in the next cycle + if (NumExecuted) { + for (auto I = IssuedInst.end() - NumExecuted, E = IssuedInst.end(); I != E; + ++I) { + if (llvm::Error E = moveToTheNextStage(*I)) + return E; + } + IssuedInst.resize(IssuedInst.size() - NumExecuted); + } + + return llvm::ErrorSuccess(); +} + +llvm::Error InOrderIssueStage::cycleStart() { + NumIssued = 0; + + // Release consumed resources. + SmallVector Freed; + RM->cycleEvent(Freed); + + if (llvm::Error E = updateIssuedInst()) + return E; + + // Issue instructions scheduled for this cycle + if (!StallCyclesLeft && StalledInst) { + if (llvm::Error E = tryIssue(StalledInst, &StallCyclesLeft)) + return E; + } + + if (!StallCyclesLeft) { + StalledInst.invalidate(); + assert(NumIssued <= SM.IssueWidth && "Overflow."); + Bandwidth = SM.IssueWidth - NumIssued; + } else { + // The instruction is still stalled, cannot issue any new instructions in + // this cycle. 
+ Bandwidth = 0; + } + + return llvm::ErrorSuccess(); +} + +llvm::Error InOrderIssueStage::cycleEnd() { + if (StallCyclesLeft > 0) + --StallCyclesLeft; + return llvm::ErrorSuccess(); +} + +} // namespace mca +} // namespace llvm diff --git a/llvm/lib/MCA/Stages/RetireStage.cpp b/llvm/lib/MCA/Stages/RetireStage.cpp --- a/llvm/lib/MCA/Stages/RetireStage.cpp +++ b/llvm/lib/MCA/Stages/RetireStage.cpp @@ -23,9 +23,6 @@ namespace mca { llvm::Error RetireStage::cycleStart() { - if (RCU.isEmpty()) - return llvm::ErrorSuccess(); - const unsigned MaxRetirePerCycle = RCU.getMaxRetirePerCycle(); unsigned NumRetired = 0; while (!RCU.isEmpty()) { @@ -39,11 +36,26 @@ NumRetired++; } + // Retire instructions that are not controlled by the RCU + for (InstRef &IR : RetireInst) { + IR.getInstruction()->retire(); + notifyInstructionRetired(IR); + } + RetireInst.resize(0); + return llvm::ErrorSuccess(); } llvm::Error RetireStage::execute(InstRef &IR) { - RCU.onInstructionExecuted(IR.getInstruction()->getRCUTokenID()); + Instruction &IS = *IR.getInstruction(); + + unsigned TokenID = IS.getRCUTokenID(); + if (TokenID != RetireControlUnit::UnhandledTokenID) { + RCU.onInstructionExecuted(TokenID); + return llvm::ErrorSuccess(); + } + + RetireInst.push_back(IR); return llvm::ErrorSuccess(); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td --- a/llvm/lib/Target/AArch64/AArch64SchedA55.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td @@ -151,6 +151,8 @@ // FP Mul, Div, Sqrt. 
Div/Sqrt are not pipelined def : WriteRes { let Latency = 4; } + +let RetireOOO = 1 in { def : WriteRes { let Latency = 22; let ResourceCycles = [29]; } def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> { let Latency = 4; } @@ -166,7 +168,7 @@ let ResourceCycles = [9]; } def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22; let ResourceCycles = [19]; } - +} //===----------------------------------------------------------------------===// // Subtarget-specific SchedRead types. @@ -336,4 +338,6 @@ def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>; def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; + +def A55RCU : RetireControlUnit<64, 0>; } diff --git a/llvm/test/TableGen/InvalidMCSchedClassDesc.td b/llvm/test/TableGen/InvalidMCSchedClassDesc.td --- a/llvm/test/TableGen/InvalidMCSchedClassDesc.td +++ b/llvm/test/TableGen/InvalidMCSchedClassDesc.td @@ -19,7 +19,7 @@ // Inst_B didn't have the resoures, and it is invalid. // CHECK: SchedModel_ASchedClasses[] = { // CHECK: {DBGFIELD("Inst_A") 1 -// CHECK-NEXT: {DBGFIELD("Inst_B") 16383 +// CHECK-NEXT: {DBGFIELD("Inst_B") 8191 let SchedModel = SchedModel_A in { def Write_A : SchedWriteRes<[]>; def : InstRW<[Write_A], (instrs Inst_A)>; @@ -27,7 +27,7 @@ // Inst_A didn't have the resoures, and it is invalid. 
// CHECK: SchedModel_BSchedClasses[] = { -// CHECK: {DBGFIELD("Inst_A") 16383 +// CHECK: {DBGFIELD("Inst_A") 8191 // CHECK-NEXT: {DBGFIELD("Inst_B") 1 let SchedModel = SchedModel_B in { def Write_B: SchedWriteRes<[]>; diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s @@ -0,0 +1,81 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --timeline --iterations=2 < %s | FileCheck %s + +add w2, w3, #1 +add w4, w3, #2, lsl #12 +add w0, w4, #3 +add w1, w0, #4 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 8 +# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total uOps: 8 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.80 +# CHECK-NEXT: IPC: 0.80 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 0.50 add w2, w3, #1 +# CHECK-NEXT: 1 3 0.50 add w4, w3, #2, lsl #12 +# CHECK-NEXT: 1 3 0.50 add w0, w4, #3 +# CHECK-NEXT: 1 3 0.50 add w1, w0, #4 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] 
[6] [7] [8] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - add w2, w3, #1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - add w4, w3, #2, lsl #12 +# CHECK-NEXT: - 1.00 - - - - - - - - - - add w0, w4, #3 +# CHECK-NEXT: 1.00 - - - - - - - - - - - add w1, w0, #4 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . add w2, w3, #1 +# CHECK-NEXT: [0,1] DeeER. . add w4, w3, #2, lsl #12 +# CHECK-NEXT: [0,2] .DeeER . add w0, w4, #3 +# CHECK-NEXT: [0,3] . DeeER . add w1, w0, #4 +# CHECK-NEXT: [1,0] . DeeER . add w2, w3, #1 +# CHECK-NEXT: [1,1] . DeeER . add w4, w3, #2, lsl #12 +# CHECK-NEXT: [1,2] . DeeER. add w0, w4, #3 +# CHECK-NEXT: [1,3] . DeeER add w1, w0, #4 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 add w2, w3, #1 +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 add w4, w3, #2, lsl #12 +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 add w0, w4, #3 +# CHECK-NEXT: 3. 
2 0.0 0.0 0.0 add w1, w0, #4 +# CHECK-NEXT: 2 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s @@ -0,0 +1,100 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --iterations=2 < %s | FileCheck %s + +ldr w4, [x2], #4 +ldr w5, [x3] +madd w0, w5, w4, w0 +add x3, x3, x13 +subs x1, x1, #1 +str w0, [x21, x18, lsl #2] + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total uOps: 14 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.67 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 3 1.00 * ldr w4, [x2], #4 +# CHECK-NEXT: 1 3 1.00 * ldr w5, [x3] +# CHECK-NEXT: 1 4 1.00 madd w0, w5, w4, w0 +# CHECK-NEXT: 1 3 0.50 add x3, x3, x13 +# CHECK-NEXT: 1 3 0.50 subs x1, x1, #1 +# CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 10 (47.6%) +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 11 (52.4%) +# CHECK-NEXT: 1, 6 (28.6%) +# CHECK-NEXT: 2, 4 (19.0%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# 
CHECK-NEXT: [# issued], [# cycles] +# CHECK-NEXT: 0, 11 (52.4%) +# CHECK-NEXT: 1, 6 (28.6%) +# CHECK-NEXT: 2, 4 (19.0%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: +# CHECK-NEXT: [# retired], [# cycles] +# CHECK-NEXT: 0, 14 (66.7%) +# CHECK-NEXT: 1, 4 (19.0%) +# CHECK-NEXT: 2, 1 (4.8%) +# CHECK-NEXT: 3, 2 (9.5%) + +# CHECK: Total ROB Entries: 64 +# CHECK-NEXT: Max Used ROB Entries: 6 ( 9.4% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 14 +# CHECK-NEXT: Max number of mappings used: 6 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 1.00 1.00 - - - - - - - 2.00 1.00 1.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w4, [x2], #4 +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w5, [x3] +# CHECK-NEXT: - - - - - - - - - - 1.00 - madd w0, w5, w4, w0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - add x3, x3, x13 +# CHECK-NEXT: 1.00 - - - - - - - - - - - subs x1, x1, #1 +# CHECK-NEXT: - - - - - - - - - - - 1.00 str w0, [x21, x18, lsl #2] diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s 
b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s @@ -0,0 +1,132 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views --iterations=2 < %s | FileCheck %s + +ldr w4, [x2], #4 +ldr w5, [x3] +madd w0, w5, w4, w0 +add x3, x3, x13 +subs x1, x1, #1 +str w0, [x21, x18, lsl #2] + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total uOps: 14 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.67 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 3 1.00 * ldr w4, [x2], #4 +# CHECK-NEXT: 1 3 1.00 * ldr w5, [x3] +# CHECK-NEXT: 1 4 1.00 madd w0, w5, w4, w0 +# CHECK-NEXT: 1 3 0.50 add x3, x3, x13 +# CHECK-NEXT: 1 3 0.50 subs x1, x1, #1 +# CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 10 (47.6%) +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 11 (52.4%) +# CHECK-NEXT: 1, 6 (28.6%) +# CHECK-NEXT: 2, 4 (19.0%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: [# issued], [# cycles] +# CHECK-NEXT: 0, 11 (52.4%) +# CHECK-NEXT: 1, 6 (28.6%) +# CHECK-NEXT: 2, 4 (19.0%) + +# 
CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: +# CHECK-NEXT: [# retired], [# cycles] +# CHECK-NEXT: 0, 14 (66.7%) +# CHECK-NEXT: 1, 4 (19.0%) +# CHECK-NEXT: 2, 1 (4.8%) +# CHECK-NEXT: 3, 2 (9.5%) + +# CHECK: Total ROB Entries: 64 +# CHECK-NEXT: Max Used ROB Entries: 6 ( 9.4% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 14 +# CHECK-NEXT: Max number of mappings used: 6 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 1.00 1.00 - - - - - - - 2.00 1.00 1.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w4, [x2], #4 +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w5, [x3] +# CHECK-NEXT: - - - - - - - - - - 1.00 - madd w0, w5, w4, w0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - add x3, x3, x13 +# CHECK-NEXT: 1.00 - - - - - - - - - - - subs x1, x1, #1 +# CHECK-NEXT: - - - - - - - - - - - 1.00 str w0, [x21, x18, lsl #2] + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0 + +# CHECK: [0,0] DeeER. . . . ldr w4, [x2], #4 +# CHECK-NEXT: [0,1] .DeeER . . . ldr w5, [x3] +# CHECK-NEXT: [0,2] . DeeeER. . . 
madd w0, w5, w4, w0 +# CHECK-NEXT: [0,3] . DeeE-R. . . add x3, x3, x13 +# CHECK-NEXT: [0,4] . DeeER. . . subs x1, x1, #1 +# CHECK-NEXT: [0,5] . . DeeeER . . str w0, [x21, x18, lsl #2] +# CHECK-NEXT: [1,0] . . DeeER . . ldr w4, [x2], #4 +# CHECK-NEXT: [1,1] . . DeeER . . ldr w5, [x3] +# CHECK-NEXT: [1,2] . . . DeeeER . madd w0, w5, w4, w0 +# CHECK-NEXT: [1,3] . . . DeeE-R . add x3, x3, x13 +# CHECK-NEXT: [1,4] . . . DeeER . subs x1, x1, #1 +# CHECK-NEXT: [1,5] . . . DeeeER str w0, [x21, x18, lsl #2] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 ldr w4, [x2], #4 +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 ldr w5, [x3] +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 madd w0, w5, w4, w0 +# CHECK-NEXT: 3. 2 0.0 0.0 1.0 add x3, x3, x13 +# CHECK-NEXT: 4. 2 0.0 0.0 0.0 subs x1, x1, #1 +# CHECK-NEXT: 5. 
2 0.0 0.0 0.0 str w0, [x21, x18, lsl #2] +# CHECK-NEXT: 2 0.0 0.0 0.2 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s @@ -0,0 +1,128 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --all-views --iterations=2 < %s | FileCheck %s + +sdiv w12, w21, w0 +add w8, w8, #1 +add w1, w2, w0 +add w3, w4, #1 +add w5, w6, w0 +add w7, w9, w0 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 18 +# CHECK-NEXT: Total uOps: 12 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.67 +# CHECK-NEXT: IPC: 0.67 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 8 8.00 sdiv w12, w21, w0 +# CHECK-NEXT: 1 3 0.50 add w8, w8, #1 +# CHECK-NEXT: 1 3 0.50 add w1, w2, w0 +# CHECK-NEXT: 1 3 0.50 add w3, w4, #1 +# CHECK-NEXT: 1 3 0.50 add w5, w6, w0 +# CHECK-NEXT: 1 3 0.50 add w7, w9, w0 + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 0 +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 5 (27.8%) + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 12 (66.7%) +# CHECK-NEXT: 2, 6 (33.3%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: [# issued], [# 
cycles] +# CHECK-NEXT: 0, 12 (66.7%) +# CHECK-NEXT: 2, 6 (33.3%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: +# CHECK-NEXT: [# retired], [# cycles] +# CHECK-NEXT: 0, 16 (88.9%) +# CHECK-NEXT: 6, 2 (11.1%) + +# CHECK: Total ROB Entries: 64 +# CHECK-NEXT: Max Used ROB Entries: 8 ( 12.5% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 5 ( 7.8% ) + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 12 +# CHECK-NEXT: Max number of mappings used: 8 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 2.50 2.50 - 8.00 - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - 8.00 - - - - - - - - sdiv w12, w21, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w8, w8, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w1, w2, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w3, w4, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w5, w6, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w7, w9, w0 + +# CHECK: Timeline view: +# CHECK-NEXT: 01234567 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeeeeeER. . . sdiv w12, w21, w0 +# CHECK-NEXT: [0,1] DeeE-----R. . . add w8, w8, #1 +# CHECK-NEXT: [0,2] .DeeE----R. . . 
add w1, w2, w0 +# CHECK-NEXT: [0,3] .DeeE----R. . . add w3, w4, #1 +# CHECK-NEXT: [0,4] . DeeE---R. . . add w5, w6, w0 +# CHECK-NEXT: [0,5] . DeeE---R. . . add w7, w9, w0 +# CHECK-NEXT: [1,0] . . DeeeeeeeER sdiv w12, w21, w0 +# CHECK-NEXT: [1,1] . . DeeE-----R add w8, w8, #1 +# CHECK-NEXT: [1,2] . . DeeE----R add w1, w2, w0 +# CHECK-NEXT: [1,3] . . DeeE----R add w3, w4, #1 +# CHECK-NEXT: [1,4] . . DeeE---R add w5, w6, w0 +# CHECK-NEXT: [1,5] . . DeeE---R add w7, w9, w0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 sdiv w12, w21, w0 +# CHECK-NEXT: 1. 2 0.0 0.0 5.0 add w8, w8, #1 +# CHECK-NEXT: 2. 2 0.0 0.0 4.0 add w1, w2, w0 +# CHECK-NEXT: 3. 2 0.0 0.0 4.0 add w3, w4, #1 +# CHECK-NEXT: 4. 2 0.0 0.0 3.0 add w5, w6, w0 +# CHECK-NEXT: 5. 
2 0.0 0.0 3.0 add w7, w9, w0 +# CHECK-NEXT: 2 0.0 0.0 3.2 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s @@ -0,0 +1,129 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --all-views --iterations=2 < %s | FileCheck %s + +fdiv s1, s2, s3 +add w8, w8, #1 +add w1, w2, w0 +add w3, w4, #1 +add w5, w6, w0 +add w7, w9, w0 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 25 +# CHECK-NEXT: Total uOps: 12 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.48 +# CHECK-NEXT: IPC: 0.48 +# CHECK-NEXT: Block RThroughput: 10.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 13 10.00 fdiv s1, s2, s3 +# CHECK-NEXT: 1 3 0.50 add w8, w8, #1 +# CHECK-NEXT: 1 3 0.50 add w1, w2, w0 +# CHECK-NEXT: 1 3 0.50 add w3, w4, #1 +# CHECK-NEXT: 1 3 0.50 add w5, w6, w0 +# CHECK-NEXT: 1 3 0.50 add w7, w9, w0 + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 0 +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 7 (28.0%) + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 19 (76.0%) +# CHECK-NEXT: 2, 6 (24.0%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: [# issued], [# 
cycles] +# CHECK-NEXT: 0, 19 (76.0%) +# CHECK-NEXT: 2, 6 (24.0%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: +# CHECK-NEXT: [# retired], [# cycles] +# CHECK-NEXT: 0, 18 (72.0%) +# CHECK-NEXT: 1, 2 (8.0%) +# CHECK-NEXT: 2, 5 (20.0%) + +# CHECK: Total ROB Entries: 64 +# CHECK-NEXT: Max Used ROB Entries: 7 ( 10.9% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 12 +# CHECK-NEXT: Max number of mappings used: 7 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 2.50 2.50 - - - - 10.00 - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - 10.00 - - - - - fdiv s1, s2, s3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w8, w8, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w1, w2, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w3, w4, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w5, w6, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w7, w9, w0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeeeeeeeeeER. . . fdiv s1, s2, s3 +# CHECK-NEXT: [0,1] DeeER. . . . . 
add w8, w8, #1 +# CHECK-NEXT: [0,2] .DeeER . . . . add w1, w2, w0 +# CHECK-NEXT: [0,3] .DeeER . . . . add w3, w4, #1 +# CHECK-NEXT: [0,4] . DeeER . . . . add w5, w6, w0 +# CHECK-NEXT: [0,5] . DeeER . . . . add w7, w9, w0 +# CHECK-NEXT: [1,0] . . DeeeeeeeeeeeeER fdiv s1, s2, s3 +# CHECK-NEXT: [1,1] . . DeeER. . . add w8, w8, #1 +# CHECK-NEXT: [1,2] . . .DeeER . . add w1, w2, w0 +# CHECK-NEXT: [1,3] . . .DeeER . . add w3, w4, #1 +# CHECK-NEXT: [1,4] . . . DeeER . . add w5, w6, w0 +# CHECK-NEXT: [1,5] . . . DeeER . . add w7, w9, w0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 fdiv s1, s2, s3 +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 add w8, w8, #1 +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 add w1, w2, w0 +# CHECK-NEXT: 3. 2 0.0 0.0 0.0 add w3, w4, #1 +# CHECK-NEXT: 4. 2 0.0 0.0 0.0 add w5, w6, w0 +# CHECK-NEXT: 5. 
2 0.0 0.0 0.0 add w7, w9, w0 +# CHECK-NEXT: 2 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/in-order-bottleneck-analysis.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/in-order-bottleneck-analysis.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/in-order-bottleneck-analysis.s @@ -0,0 +1,8 @@ +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views < %s | FileCheck %s +# CHECK-NOT: Throughput Bottlenecks + +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --bottleneck-analysis < %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-WARN +# CHECK-WARN: warning: bottleneck analysis is not supported for in-order CPU 'cortex-a55' + +add w2, w3, #1 + diff --git a/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s b/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/ARM/m7-negative-readadvance.s @@ -0,0 +1,75 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=arm -mcpu=cortex-m7 --timeline --iterations=1 < %s | FileCheck %s + +add r1, r1, #1 +# ReadAdvance: 0 +add r1, r1, #2 +# ReadAdvance: -1 +vldr d0, [r1] + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 3 +# CHECK-NEXT: Total Cycles: 7 +# CHECK-NEXT: Total uOps: 3 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.43 +# CHECK-NEXT: IPC: 0.43 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.50 add.w r1, r1, #1 +# CHECK-NEXT: 1 1 0.50 add.w r1, r1, #2 +# CHECK-NEXT: 1 3 1.00 * vldr d0, [r1] + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - M7UnitALU +# CHECK-NEXT: [0.1] - M7UnitALU +# CHECK-NEXT: [1] - M7UnitBranch +# CHECK-NEXT: [2.0] - M7UnitLoad +# 
CHECK-NEXT: [2.1] - M7UnitLoad +# CHECK-NEXT: [3] - M7UnitMAC +# CHECK-NEXT: [4] - M7UnitSIMD +# CHECK-NEXT: [5] - M7UnitShift1 +# CHECK-NEXT: [6] - M7UnitShift2 +# CHECK-NEXT: [7] - M7UnitStore +# CHECK-NEXT: [8] - M7UnitVFP +# CHECK-NEXT: [9.0] - M7UnitVPort +# CHECK-NEXT: [9.1] - M7UnitVPort + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2.0] [2.1] [3] [4] [5] [6] [7] [8] [9.0] [9.1] +# CHECK-NEXT: 1.00 1.00 - - 1.00 - - - - - - - 2.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2.0] [2.1] [3] [4] [5] [6] [7] [8] [9.0] [9.1] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - add.w r1, r1, #1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - - add.w r1, r1, #2 +# CHECK-NEXT: - - - - 1.00 - - - - - - - 2.00 vldr d0, [r1] + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DER .. add.w r1, r1, #1 +# CHECK-NEXT: [0,1] .DER .. add.w r1, r1, #2 +# CHECK-NEXT: [0,2] . DeER vldr d0, [r1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 add.w r1, r1, #1 +# CHECK-NEXT: 1. 1 0.0 0.0 0.0 add.w r1, r1, #2 +# CHECK-NEXT: 2. 1 0.0 0.0 0.0 vldr d0, [r1] +# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/in-order-cpu.s b/llvm/test/tools/llvm-mca/X86/in-order-cpu.s --- a/llvm/test/tools/llvm-mca/X86/in-order-cpu.s +++ b/llvm/test/tools/llvm-mca/X86/in-order-cpu.s @@ -1,3 +1,3 @@ -# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o /dev/null 2>&1 | FileCheck %s - -# CHECK: error: please specify an out-of-order cpu. 'atom' is an in-order cpu. 
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o /dev/null 2>&1 | FileCheck %s +# CHECK: warning: support for in-order CPU 'atom' is experimental. +movsbw %al, %di diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -257,14 +257,15 @@ O = Default.getValue(); } -static void processViewOptions() { +static void processViewOptions(bool IsOutOfOrder) { if (!EnableAllViews.getNumOccurrences() && !EnableAllStats.getNumOccurrences()) return; if (EnableAllViews.getNumOccurrences()) { processOptionImpl(PrintSummaryView, EnableAllViews); - processOptionImpl(EnableBottleneckAnalysis, EnableAllViews); + if (IsOutOfOrder) + processOptionImpl(EnableBottleneckAnalysis, EnableAllViews); processOptionImpl(PrintResourcePressureView, EnableAllViews); processOptionImpl(PrintTimelineView, EnableAllViews); processOptionImpl(PrintInstructionInfoView, EnableAllViews); @@ -327,9 +328,6 @@ return 1; } - // Apply overrides to llvm-mca specific options. - processViewOptions(); - if (MCPU == "native") MCPU = std::string(llvm::sys::getHostCPUName()); @@ -339,10 +337,10 @@ if (!STI->isCPUStringValid(MCPU)) return 1; - if (!PrintInstructionTables && !STI->getSchedModel().isOutOfOrder()) { - WithColor::error() << "please specify an out-of-order cpu. '" << MCPU - << "' is an in-order cpu.\n"; - return 1; + bool IsOutOfOrder = STI->getSchedModel().isOutOfOrder(); + if (!PrintInstructionTables && !IsOutOfOrder) { + WithColor::warning() << "support for in-order CPU '" << MCPU + << "' is experimental.\n"; } if (!STI->getSchedModel().hasInstrSchedModel()) { @@ -358,6 +356,9 @@ return 1; } + // Apply overrides to llvm-mca specific options. 
+ processViewOptions(IsOutOfOrder); + std::unique_ptr MRI(TheTarget->createMCRegInfo(TripleName)); assert(MRI && "Unable to create target register info!"); @@ -539,6 +540,11 @@ std::make_unique(SM, Insts, DispatchWidth)); if (EnableBottleneckAnalysis) { + if (!IsOutOfOrder) { + WithColor::warning() + << "bottleneck analysis is not supported for in-order CPU '" << MCPU + << "'.\n"; + } Printer.addView(std::make_unique( *STI, *IP, Insts, S.getNumIterations())); } diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -993,6 +993,7 @@ SCDesc.NumMicroOps = 0; SCDesc.BeginGroup = false; SCDesc.EndGroup = false; + SCDesc.RetireOOO = false; SCDesc.WriteProcResIdx = 0; SCDesc.WriteLatencyIdx = 0; SCDesc.ReadAdvanceIdx = 0; @@ -1095,6 +1096,7 @@ SCDesc.EndGroup |= WriteRes->getValueAsBit("EndGroup"); SCDesc.BeginGroup |= WriteRes->getValueAsBit("SingleIssue"); SCDesc.EndGroup |= WriteRes->getValueAsBit("SingleIssue"); + SCDesc.RetireOOO |= WriteRes->getValueAsBit("RetireOOO"); // Create an entry for each ProcResource listed in WriteRes. 
RecVec PRVec = WriteRes->getValueAsListOfDefs("ProcResources"); @@ -1293,7 +1295,7 @@ std::vector &SCTab = SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())]; - OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup," + OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup, RetireOOO," << " WriteProcResIdx,#, WriteLatencyIdx,#, ReadAdvanceIdx,#}\n"; OS << "static const llvm::MCSchedClassDesc " << PI->ModelName << "SchedClasses[] = {\n"; @@ -1304,7 +1306,7 @@ && "invalid class not first"); OS << " {DBGFIELD(\"InvalidSchedClass\") " << MCSchedClassDesc::InvalidNumMicroOps - << ", false, false, 0, 0, 0, 0, 0, 0},\n"; + << ", false, false, false, 0, 0, 0, 0, 0, 0},\n"; for (unsigned SCIdx = 1, SCEnd = SCTab.size(); SCIdx != SCEnd; ++SCIdx) { MCSchedClassDesc &MCDesc = SCTab[SCIdx]; @@ -1315,6 +1317,7 @@ OS << MCDesc.NumMicroOps << ", " << ( MCDesc.BeginGroup ? "true" : "false" ) << ", " << ( MCDesc.EndGroup ? "true" : "false" ) + << ", " << ( MCDesc.RetireOOO ? "true" : "false" ) << ", " << format("%2d", MCDesc.WriteProcResIdx) << ", " << MCDesc.NumWriteProcResEntries << ", " << format("%2d", MCDesc.WriteLatencyIdx)