diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h --- a/llvm/include/llvm/MC/MCSchedule.h +++ b/llvm/include/llvm/MC/MCSchedule.h @@ -108,15 +108,16 @@ /// /// Defined as an aggregate struct for creating tables with initializer lists. struct MCSchedClassDesc { - static const unsigned short InvalidNumMicroOps = (1U << 14) - 1; + static const unsigned short InvalidNumMicroOps = (1U << 13) - 1; static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) const char* Name; #endif - uint16_t NumMicroOps : 14; + uint16_t NumMicroOps : 13; bool BeginGroup : 1; bool EndGroup : 1; + bool RetireOOO : 1; uint16_t WriteProcResIdx; // First index into WriteProcResTable. uint16_t NumWriteProcResEntries; uint16_t WriteLatencyIdx; // First index into WriteLatencyTable. diff --git a/llvm/include/llvm/MCA/Context.h b/llvm/include/llvm/MCA/Context.h --- a/llvm/include/llvm/MCA/Context.h +++ b/llvm/include/llvm/MCA/Context.h @@ -68,6 +68,11 @@ /// This pipeline consists of Fetch, Dispatch, Execute, and Retire stages. std::unique_ptr createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr); + + /// Construct a basic pipeline for simulating an in-order pipeline. + /// This pipeline consists of Fetch, InOrderIssue, and Retire stages. + std::unique_ptr createInOrderPipeline(const PipelineOptions &Opts, + SourceMgr &SrcMgr); }; } // namespace mca diff --git a/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h b/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h --- a/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h +++ b/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h @@ -172,11 +172,6 @@ void freePhysRegs(const RegisterRenamingInfo &Entry, MutableArrayRef FreedPhysRegs); - // Collects writes that are in a RAW dependency with RS. - // This method is called from `addRegisterRead()`. - void collectWrites(const ReadState &RS, - SmallVectorImpl &Writes) const; - // Create an instance of RegisterMappingTracker for every register file // specified by the processor model. // If no register file is specified, then this method creates a default @@ -187,6 +182,10 @@ RegisterFile(const MCSchedModel &SM, const MCRegisterInfo &mri, unsigned NumRegs = 0); + // Collects writes that are in a RAW dependency with RS. + void collectWrites(const ReadState &RS, + SmallVectorImpl &Writes) const; + // This method updates the register mappings inserting a new register // definition. This method is also responsible for updating the number of // allocated physical registers in each register file modified by the write. diff --git a/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h b/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h --- a/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h +++ b/llvm/include/llvm/MCA/HardwareUnits/RetireControlUnit.h @@ -104,6 +104,9 @@ #ifndef NDEBUG void dump() const; #endif + + // Assigned to instructions that are not handled by the RCU. + static const unsigned UnhandledTokenID = ~0U; }; } // namespace mca diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h --- a/llvm/include/llvm/MCA/Instruction.h +++ b/llvm/include/llvm/MCA/Instruction.h @@ -375,6 +375,7 @@ bool HasSideEffects; bool BeginGroup; bool EndGroup; + bool RetireOOO; // True if all buffered resources are in-order, and there is at least one // buffer which is a dispatch hazard (BufferSize = 0). diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h @@ -0,0 +1,84 @@ +//===---------------------- InOrderIssueStage.h -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// InOrderIssueStage implements an in-order execution pipeline. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MCA_IN_ORDER_ISSUE_STAGE_H +#define LLVM_MCA_IN_ORDER_ISSUE_STAGE_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/MCA/SourceMgr.h" +#include "llvm/MCA/Stages/Stage.h" + +#include + +namespace llvm { +struct MCSchedModel; +class MCSubtargetInfo; + +namespace mca { +class RegisterFile; +class ResourceManager; +struct RetireControlUnit; + +class InOrderIssueStage final : public Stage { + const MCSchedModel &SM; + const MCSubtargetInfo &STI; + RetireControlUnit &RCU; + RegisterFile &PRF; + std::unique_ptr RM; + + /// Instructions that were issued, but not executed yet. + SmallVector IssuedInst; + + /// Number of instructions issued in the current cycle. + unsigned NumIssued; + + /// If an instruction cannot execute due to an unmet register or resource + /// dependency, the it is stalled for StallCyclesLeft. + InstRef StalledInst; + unsigned StallCyclesLeft; + + /// Number of instructions that can be issued in the current cycle. + unsigned Bandwidth; + + InOrderIssueStage(const InOrderIssueStage &Other) = delete; + InOrderIssueStage &operator=(const InOrderIssueStage &Other) = delete; + + /// If IR has an unmet register or resource dependency, canExecute returns + /// false. StallCycles is set to the number of cycles left before the + /// instruction can be issued. + bool canExecute(const InstRef &IR, unsigned *StallCycles) const; + + /// Issue the instruction, or update StallCycles if IR is stalled. + Error tryIssue(InstRef &IR, unsigned *StallCycles); + + /// Update status of instructions from IssuedInst. + Error updateIssuedInst(); + +public: + InOrderIssueStage(RetireControlUnit &RCU, RegisterFile &PRF, + const MCSchedModel &SM, const MCSubtargetInfo &STI) + : SM(SM), STI(STI), RCU(RCU), PRF(PRF), + RM(std::make_unique(SM)), StallCyclesLeft(0), + Bandwidth(0) {} + + bool isAvailable(const InstRef &) const override; + bool hasWorkToComplete() const override; + Error execute(InstRef &IR) override; + Error cycleStart() override; + Error cycleEnd() override; +}; + +} // namespace mca +} // namespace llvm + +#endif // LLVM_MCA_IN_ORDER_ISSUE_STAGE_H diff --git a/llvm/include/llvm/MCA/Stages/RetireStage.h b/llvm/include/llvm/MCA/Stages/RetireStage.h --- a/llvm/include/llvm/MCA/Stages/RetireStage.h +++ b/llvm/include/llvm/MCA/Stages/RetireStage.h @@ -16,6 +16,7 @@ #ifndef LLVM_MCA_RETIRE_STAGE_H #define LLVM_MCA_RETIRE_STAGE_H +#include "llvm/ADT/SmallVector.h" #include "llvm/MCA/HardwareUnits/LSUnit.h" #include "llvm/MCA/HardwareUnits/RegisterFile.h" #include "llvm/MCA/HardwareUnits/RetireControlUnit.h" @@ -29,6 +30,7 @@ RetireControlUnit &RCU; RegisterFile &PRF; LSUnitBase &LSU; + SmallVector RetireInst; RetireStage(const RetireStage &Other) = delete; RetireStage &operator=(const RetireStage &Other) = delete; @@ -37,7 +39,9 @@ RetireStage(RetireControlUnit &R, RegisterFile &F, LSUnitBase &LS) : Stage(), RCU(R), PRF(F), LSU(LS) {} - bool hasWorkToComplete() const override { return !RCU.isEmpty(); } + bool hasWorkToComplete() const override { + return !RCU.isEmpty() || !RetireInst.empty(); + } Error cycleStart() override; Error execute(InstRef &IR) override; void notifyInstructionRetired(const InstRef &IR) const; diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td --- a/llvm/include/llvm/Target/TargetSchedule.td +++ b/llvm/include/llvm/Target/TargetSchedule.td @@ -262,6 +262,10 @@ // Allow a processor to mark some scheduling classes as single-issue. // SingleIssue is an alias for Begin/End Group. bit SingleIssue = false; + // An instruction is allowed to retire out-of-order if RetireOOO is + // true for at least one of its writes. This field is only used by + // MCA for in-order subtargets, and is ignored for other targets. + bit RetireOOO = false; SchedMachineModel SchedModel = ?; } diff --git a/llvm/lib/MCA/CMakeLists.txt b/llvm/lib/MCA/CMakeLists.txt --- a/llvm/lib/MCA/CMakeLists.txt +++ b/llvm/lib/MCA/CMakeLists.txt @@ -14,6 +14,7 @@ Stages/DispatchStage.cpp Stages/EntryStage.cpp Stages/ExecuteStage.cpp + Stages/InOrderIssueStage.cpp Stages/InstructionTables.cpp Stages/MicroOpQueueStage.cpp Stages/RetireStage.cpp diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp --- a/llvm/lib/MCA/Context.cpp +++ b/llvm/lib/MCA/Context.cpp @@ -21,6 +21,7 @@ #include "llvm/MCA/Stages/DispatchStage.h" #include "llvm/MCA/Stages/EntryStage.h" #include "llvm/MCA/Stages/ExecuteStage.h" +#include "llvm/MCA/Stages/InOrderIssueStage.h" #include "llvm/MCA/Stages/MicroOpQueueStage.h" #include "llvm/MCA/Stages/RetireStage.h" @@ -31,6 +32,9 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) { const MCSchedModel &SM = STI.getSchedModel(); + if (!SM.isOutOfOrder()) + return createInOrderPipeline(Opts, SrcMgr); + // Create the hardware units defining the backend. auto RCU = std::make_unique(SM); auto PRF = std::make_unique(SM, MRI, Opts.RegisterFileSize); @@ -64,5 +68,29 @@ return StagePipeline; } +std::unique_ptr +Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) { + const MCSchedModel &SM = STI.getSchedModel(); + auto RCU = std::make_unique(SM); + auto PRF = std::make_unique(SM, MRI, Opts.RegisterFileSize); + auto LSU = std::make_unique(SM, Opts.LoadQueueSize, + Opts.StoreQueueSize, Opts.AssumeNoAlias); + + auto Entry = std::make_unique(SrcMgr); + auto InOrderIssue = std::make_unique(*RCU, *PRF, SM, STI); + auto Retire = std::make_unique(*RCU, *PRF, *LSU); + + auto StagePipeline = std::make_unique(); + StagePipeline->appendStage(std::move(Entry)); + StagePipeline->appendStage(std::move(InOrderIssue)); + StagePipeline->appendStage(std::move(Retire)); + + addHardwareUnit(std::move(RCU)); + addHardwareUnit(std::move(PRF)); + addHardwareUnit(std::move(LSU)); + + return StagePipeline; +} + } // namespace mca } // namespace llvm diff --git a/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp b/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp --- a/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp +++ b/llvm/lib/MCA/HardwareUnits/RetireControlUnit.cpp @@ -33,12 +33,18 @@ MaxRetirePerCycle = EPI.MaxRetirePerCycle; } NumROBEntries = AvailableEntries; + bool IsOutOfOrder = SM.MicroOpBufferSize; + if (!IsOutOfOrder && !NumROBEntries) + return; assert(NumROBEntries && "Invalid reorder buffer size!"); Queue.resize(2 * NumROBEntries); } // Reserves a number of slots, and returns a new token. unsigned RetireControlUnit::dispatch(const InstRef &IR) { + if (!NumROBEntries) + return UnhandledTokenID; + const Instruction &Inst = *IR.getInstruction(); unsigned Entries = normalizeQuantity(Inst.getNumMicroOps()); assert((AvailableEntries >= Entries) && "Reorder Buffer unavailable!"); @@ -47,6 +53,7 @@ Queue[NextAvailableSlotIdx] = {IR, Entries, false}; NextAvailableSlotIdx += std::max(1U, Entries); NextAvailableSlotIdx %= Queue.size(); + assert(TokenID < UnhandledTokenID && "Invalid token ID"); AvailableEntries -= Entries; return TokenID; diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp --- a/llvm/lib/MCA/InstrBuilder.cpp +++ b/llvm/lib/MCA/InstrBuilder.cpp @@ -570,6 +570,7 @@ ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects(); ID->BeginGroup = SCDesc.BeginGroup; ID->EndGroup = SCDesc.EndGroup; + ID->RetireOOO = SCDesc.RetireOOO; initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks); computeMaxLatency(*ID, MCDesc, SCDesc, STI); diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp @@ -0,0 +1,288 @@ +//===---------------------- InOrderIssueStage.cpp ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// InOrderIssueStage implements an in-order execution pipeline. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/MCA/Stages/InOrderIssueStage.h" + +#include "llvm/MC/MCSchedule.h" +#include "llvm/MCA/HWEventListener.h" +#include "llvm/MCA/HardwareUnits/RegisterFile.h" +#include "llvm/MCA/HardwareUnits/ResourceManager.h" +#include "llvm/MCA/HardwareUnits/RetireControlUnit.h" +#include "llvm/MCA/Instruction.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" + +#include + +#define DEBUG_TYPE "llvm-mca" +namespace llvm { +namespace mca { + +bool InOrderIssueStage::hasWorkToComplete() const { + return !IssuedInst.empty() || StalledInst; +} + +bool InOrderIssueStage::isAvailable(const InstRef &IR) const { + const Instruction &Inst = *IR.getInstruction(); + unsigned NumMicroOps = Inst.getNumMicroOps(); + const InstrDesc &Desc = Inst.getDesc(); + + if (Bandwidth < NumMicroOps) + return false; + + // Instruction with BeginGroup must be the first instruction to be issued in a + // cycle. + if (Desc.BeginGroup && NumIssued != 0) + return false; + + return true; +} + +static bool hasResourceHazard(const ResourceManager &RM, const InstRef &IR) { + if (RM.checkAvailability(IR.getInstruction()->getDesc())) { + LLVM_DEBUG(dbgs() << "[E] Stall #" << IR << '\n'); + return true; + } + + return false; +} + +/// Return a number of cycles left until register requirements of the +/// instructions are met. +static unsigned checkRegisterHazard(const RegisterFile &PRF, + const MCSchedModel &SM, + const MCSubtargetInfo &STI, + const InstRef &IR) { + unsigned StallCycles = 0; + SmallVector Writes; + + for (const ReadState &RS : IR.getInstruction()->getUses()) { + const ReadDescriptor &RD = RS.getDescriptor(); + const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID); + + PRF.collectWrites(RS, Writes); + for (const WriteRef &WR : Writes) { + const WriteState *WS = WR.getWriteState(); + unsigned WriteResID = WS->getWriteResourceID(); + int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID); + LLVM_DEBUG(dbgs() << "[E] ReadAdvance for #" << IR << ": " << ReadAdvance + << '\n'); + + assert(ReadAdvance >= 0); + assert(WS->getCyclesLeft() != UNKNOWN_CYCLES); + unsigned CyclesLeft = WS->getCyclesLeft(); + if (CyclesLeft > (unsigned)ReadAdvance) { + LLVM_DEBUG(dbgs() << "[E] Register hazard: " << WS->getRegisterID() + << '\n'); + StallCycles = std::max(StallCycles, CyclesLeft - ReadAdvance); + } + } + Writes.clear(); + } + + return StallCycles; +} + +bool InOrderIssueStage::canExecute(const InstRef &IR, + unsigned *StallCycles) const { + *StallCycles = 0; + + if (unsigned RegStall = checkRegisterHazard(PRF, SM, STI, IR)) { + *StallCycles = RegStall; + // FIXME: add a parameter to HWStallEvent to indicate a number of cycles. + for (unsigned I = 0; I < RegStall; ++I) { + notifyEvent( + HWStallEvent(HWStallEvent::RegisterFileStall, IR)); + notifyEvent( + HWPressureEvent(HWPressureEvent::REGISTER_DEPS, IR)); + } + } else if (hasResourceHazard(*RM, IR)) { + *StallCycles = 1; + notifyEvent( + HWStallEvent(HWStallEvent::DispatchGroupStall, IR)); + notifyEvent( + HWPressureEvent(HWPressureEvent::RESOURCES, IR)); + } + + return *StallCycles == 0; +} + +static void addRegisterReadWrite(RegisterFile &PRF, Instruction &IS, + unsigned SourceIndex, + const MCSubtargetInfo &STI, + SmallVectorImpl &UsedRegs) { + assert(!IS.isEliminated()); + + for (ReadState &RS : IS.getUses()) + PRF.addRegisterRead(RS, STI); + + for (WriteState &WS : IS.getDefs()) + PRF.addRegisterWrite(WriteRef(SourceIndex, &WS), UsedRegs); +} + +static void notifyInstructionExecute( + const InstRef &IR, + const SmallVectorImpl> &UsedRes, + const Stage &S) { + + S.notifyEvent( + HWInstructionEvent(HWInstructionEvent::Ready, IR)); + S.notifyEvent(HWInstructionIssuedEvent(IR, UsedRes)); + + LLVM_DEBUG(dbgs() << "[E] Issued #" << IR << "\n"); +} + +static void notifyInstructionDispatch(const InstRef &IR, unsigned Ops, + const SmallVectorImpl &UsedRegs, + const Stage &S) { + + S.notifyEvent( + HWInstructionDispatchedEvent(IR, UsedRegs, Ops)); + + LLVM_DEBUG(dbgs() << "[E] Dispatched #" << IR << "\n"); +} + +llvm::Error InOrderIssueStage::execute(InstRef &IR) { + Instruction &IS = *IR.getInstruction(); + const InstrDesc &Desc = IS.getDesc(); + + unsigned RCUTokenID = RetireControlUnit::UnhandledTokenID; + if (!Desc.RetireOOO) + RCUTokenID = RCU.dispatch(IR); + IS.dispatch(RCUTokenID); + + if (Desc.EndGroup) { + Bandwidth = 0; + } else { + unsigned NumMicroOps = IR.getInstruction()->getNumMicroOps(); + assert(Bandwidth >= NumMicroOps); + Bandwidth -= NumMicroOps; + } + + if (llvm::Error E = tryIssue(IR, &StallCyclesLeft)) + return E; + + if (StallCyclesLeft) { + StalledInst = IR; + Bandwidth = 0; + } + + return llvm::ErrorSuccess(); +} + +llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) { + Instruction &IS = *IR.getInstruction(); + unsigned SourceIndex = IR.getSourceIndex(); + + if (!canExecute(IR, StallCycles)) { + LLVM_DEBUG(dbgs() << "[E] Stalled #" << IR << " for " << *StallCycles + << " cycles\n"); + return llvm::ErrorSuccess(); + } + + SmallVector UsedRegs(PRF.getNumRegisterFiles()); + addRegisterReadWrite(PRF, IS, SourceIndex, STI, UsedRegs); + + notifyInstructionDispatch(IR, IS.getDesc().NumMicroOps, UsedRegs, *this); + + SmallVector, 4> UsedResources; + RM->issueInstruction(IS.getDesc(), UsedResources); + IS.execute(SourceIndex); + + // Replace resource masks with valid resource processor IDs. + for (std::pair &Use : UsedResources) { + uint64_t Mask = Use.first.first; + Use.first.first = RM->resolveResourceMask(Mask); + } + notifyInstructionExecute(IR, UsedResources, *this); + + IssuedInst.push_back(IR); + ++NumIssued; + + return llvm::ErrorSuccess(); +} + +llvm::Error InOrderIssueStage::updateIssuedInst() { + // Update other instructions. Executed instructions will be retired during the + // next cycle. + unsigned NumExecuted = 0; + for (auto I = IssuedInst.begin(), E = IssuedInst.end(); + I != (E - NumExecuted);) { + InstRef &IR = *I; + Instruction &IS = *IR.getInstruction(); + + IS.cycleEvent(); + if (!IS.isExecuted()) { + LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR + << " is still executing\n"); + ++I; + continue; + } + notifyEvent( + HWInstructionEvent(HWInstructionEvent::Executed, IR)); + + LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR << " is executed\n"); + ++NumExecuted; + std::iter_swap(I, E - NumExecuted); + } + + // Retire instructions in the next cycle + if (NumExecuted) { + for (auto I = IssuedInst.end() - NumExecuted, E = IssuedInst.end(); I != E; + ++I) { + if (llvm::Error E = moveToTheNextStage(*I)) + return E; + } + IssuedInst.resize(IssuedInst.size() - NumExecuted); + } + + return llvm::ErrorSuccess(); +} + +llvm::Error InOrderIssueStage::cycleStart() { + NumIssued = 0; + + // Release consumed resources. + SmallVector Freed; + RM->cycleEvent(Freed); + + if (llvm::Error E = updateIssuedInst()) + return E; + + // Issue instructions scheduled for this cycle + if (!StallCyclesLeft && StalledInst) { + if (llvm::Error E = tryIssue(StalledInst, &StallCyclesLeft)) + return E; + } + + if (!StallCyclesLeft) { + StalledInst.invalidate(); + assert(NumIssued <= SM.IssueWidth && "Overflow."); + Bandwidth = SM.IssueWidth - NumIssued; + } else { + // The instruction is still stalled, cannot issue any new instructions in + // this cycle. + Bandwidth = 0; + } + + return llvm::ErrorSuccess(); +} + +llvm::Error InOrderIssueStage::cycleEnd() { + if (StallCyclesLeft > 0) + --StallCyclesLeft; + return llvm::ErrorSuccess(); +} + +} // namespace mca +} // namespace llvm diff --git a/llvm/lib/MCA/Stages/RetireStage.cpp b/llvm/lib/MCA/Stages/RetireStage.cpp --- a/llvm/lib/MCA/Stages/RetireStage.cpp +++ b/llvm/lib/MCA/Stages/RetireStage.cpp @@ -23,9 +23,6 @@ namespace mca { llvm::Error RetireStage::cycleStart() { - if (RCU.isEmpty()) - return llvm::ErrorSuccess(); - const unsigned MaxRetirePerCycle = RCU.getMaxRetirePerCycle(); unsigned NumRetired = 0; while (!RCU.isEmpty()) { @@ -39,11 +36,26 @@ NumRetired++; } + // Retire instructions that are not controlled by the RCU + for (InstRef &IR : RetireInst) { + IR.getInstruction()->retire(); + notifyInstructionRetired(IR); + } + RetireInst.resize(0); + return llvm::ErrorSuccess(); } llvm::Error RetireStage::execute(InstRef &IR) { - RCU.onInstructionExecuted(IR.getInstruction()->getRCUTokenID()); + Instruction &IS = *IR.getInstruction(); + + unsigned TokenID = IS.getRCUTokenID(); + if (TokenID != RetireControlUnit::UnhandledTokenID) { + RCU.onInstructionExecuted(TokenID); + return llvm::ErrorSuccess(); + } + + RetireInst.push_back(IR); return llvm::ErrorSuccess(); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td --- a/llvm/lib/Target/AArch64/AArch64SchedA55.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td @@ -151,6 +151,8 @@ // FP Mul, Div, Sqrt. Div/Sqrt are not pipelined def : WriteRes { let Latency = 4; } + +let RetireOOO = 1 in { def : WriteRes { let Latency = 22; let ResourceCycles = [29]; } def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> { let Latency = 4; } @@ -166,7 +168,7 @@ let ResourceCycles = [9]; } def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22; let ResourceCycles = [19]; } - +} //===----------------------------------------------------------------------===// // Subtarget-specific SchedRead types. @@ -336,4 +338,6 @@ def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>; def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; + +def A55RCU : RetireControlUnit<64, 0>; } diff --git a/llvm/test/TableGen/InvalidMCSchedClassDesc.td b/llvm/test/TableGen/InvalidMCSchedClassDesc.td --- a/llvm/test/TableGen/InvalidMCSchedClassDesc.td +++ b/llvm/test/TableGen/InvalidMCSchedClassDesc.td @@ -19,7 +19,7 @@ // Inst_B didn't have the resoures, and it is invalid. // CHECK: SchedModel_ASchedClasses[] = { // CHECK: {DBGFIELD("Inst_A") 1 -// CHECK-NEXT: {DBGFIELD("Inst_B") 16383 +// CHECK-NEXT: {DBGFIELD("Inst_B") 8191 let SchedModel = SchedModel_A in { def Write_A : SchedWriteRes<[]>; def : InstRW<[Write_A], (instrs Inst_A)>; @@ -27,7 +27,7 @@ // Inst_A didn't have the resoures, and it is invalid. // CHECK: SchedModel_BSchedClasses[] = { -// CHECK: {DBGFIELD("Inst_A") 16383 +// CHECK: {DBGFIELD("Inst_A") 8191 // CHECK-NEXT: {DBGFIELD("Inst_B") 1 let SchedModel = SchedModel_B in { def Write_B: SchedWriteRes<[]>; diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s @@ -0,0 +1,81 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --timeline --iterations=2 < %s | FileCheck %s + +add w2, w3, #1 +add w4, w3, #2, lsl #12 +add w0, w4, #3 +add w1, w0, #4 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 8 +# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total uOps: 8 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.80 +# CHECK-NEXT: IPC: 0.80 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 0.50 add w2, w3, #1 +# CHECK-NEXT: 1 3 0.50 add w4, w3, #2, lsl #12 +# CHECK-NEXT: 1 3 0.50 add w0, w4, #3 +# CHECK-NEXT: 1 3 0.50 add w1, w0, #4 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - add w2, w3, #1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - add w4, w3, #2, lsl #12 +# CHECK-NEXT: - 1.00 - - - - - - - - - - add w0, w4, #3 +# CHECK-NEXT: 1.00 - - - - - - - - - - - add w1, w0, #4 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . add w2, w3, #1 +# CHECK-NEXT: [0,1] DeeER. . add w4, w3, #2, lsl #12 +# CHECK-NEXT: [0,2] .DeeER . add w0, w4, #3 +# CHECK-NEXT: [0,3] . DeeER . add w1, w0, #4 +# CHECK-NEXT: [1,0] . DeeER . add w2, w3, #1 +# CHECK-NEXT: [1,1] . DeeER . add w4, w3, #2, lsl #12 +# CHECK-NEXT: [1,2] . DeeER. add w0, w4, #3 +# CHECK-NEXT: [1,3] . DeeER add w1, w0, #4 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 add w2, w3, #1 +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 add w4, w3, #2, lsl #12 +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 add w0, w4, #3 +# CHECK-NEXT: 3. 2 0.0 0.0 0.0 add w1, w0, #4 +# CHECK-NEXT: 2 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s @@ -0,0 +1,100 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --iterations=2 < %s | FileCheck %s + +ldr w4, [x2], #4 +ldr w5, [x3] +madd w0, w5, w4, w0 +add x3, x3, x13 +subs x1, x1, #1 +str w0, [x21, x18, lsl #2] + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total uOps: 14 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.67 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 3 1.00 * ldr w4, [x2], #4 +# CHECK-NEXT: 1 3 1.00 * ldr w5, [x3] +# CHECK-NEXT: 1 4 1.00 madd w0, w5, w4, w0 +# CHECK-NEXT: 1 3 0.50 add x3, x3, x13 +# CHECK-NEXT: 1 3 0.50 subs x1, x1, #1 +# CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 10 (47.6%) +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 11 (52.4%) +# CHECK-NEXT: 1, 6 (28.6%) +# CHECK-NEXT: 2, 4 (19.0%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: [# issued], [# cycles] +# CHECK-NEXT: 0, 11 (52.4%) +# CHECK-NEXT: 1, 6 (28.6%) +# CHECK-NEXT: 2, 4 (19.0%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: +# CHECK-NEXT: [# retired], [# cycles] +# CHECK-NEXT: 0, 14 (66.7%) +# CHECK-NEXT: 1, 4 (19.0%) +# CHECK-NEXT: 2, 1 (4.8%) +# CHECK-NEXT: 3, 2 (9.5%) + +# CHECK: Total ROB Entries: 64 +# CHECK-NEXT: Max Used ROB Entries: 6 ( 9.4% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 14 +# CHECK-NEXT: Max number of mappings used: 6 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 1.00 1.00 - - - - - - - 2.00 1.00 1.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w4, [x2], #4 +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w5, [x3] +# CHECK-NEXT: - - - - - - - - - - 1.00 - madd w0, w5, w4, w0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - add x3, x3, x13 +# CHECK-NEXT: 1.00 - - - - - - - - - - - subs x1, x1, #1 +# CHECK-NEXT: - - - - - - - - - - - 1.00 str w0, [x21, x18, lsl #2] diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s @@ -0,0 +1,139 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views --iterations=2 < %s | FileCheck %s + +ldr w4, [x2], #4 +ldr w5, [x3] +madd w0, w5, w4, w0 +add x3, x3, x13 +subs x1, x1, #1 +str w0, [x21, x18, lsl #2] + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total uOps: 14 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.67 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.5 + +# CHECK: Cycles with backend pressure increase [ 19.05% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 0.00% ] +# CHECK-NEXT: Data Dependencies: [ 19.05% ] +# CHECK-NEXT: - Register Dependencies [ 19.05% ] +# CHECK-NEXT: - Memory Dependencies [ 0.00% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 3 1.00 * ldr w4, [x2], #4 +# CHECK-NEXT: 1 3 1.00 * ldr w5, [x3] +# CHECK-NEXT: 1 4 1.00 madd w0, w5, w4, w0 +# CHECK-NEXT: 1 3 0.50 add x3, x3, x13 +# CHECK-NEXT: 1 3 0.50 subs x1, x1, #1 +# CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 10 (47.6%) +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 11 (52.4%) +# CHECK-NEXT: 1, 6 (28.6%) +# CHECK-NEXT: 2, 4 (19.0%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: [# issued], [# cycles] +# CHECK-NEXT: 0, 11 (52.4%) +# CHECK-NEXT: 1, 6 (28.6%) +# CHECK-NEXT: 2, 4 (19.0%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: +# CHECK-NEXT: [# retired], [# cycles] +# CHECK-NEXT: 0, 14 (66.7%) +# CHECK-NEXT: 1, 4 (19.0%) +# CHECK-NEXT: 2, 1 (4.8%) +# CHECK-NEXT: 3, 2 (9.5%) + +# CHECK: Total ROB Entries: 64 +# CHECK-NEXT: Max Used ROB Entries: 6 ( 9.4% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 14 +# CHECK-NEXT: Max number of mappings used: 6 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 1.00 1.00 - - - - - - - 2.00 1.00 1.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w4, [x2], #4 +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w5, [x3] +# CHECK-NEXT: - - - - - - - - - - 1.00 - madd w0, w5, w4, w0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - add x3, x3, x13 +# CHECK-NEXT: 1.00 - - - - - - - - - - - subs x1, x1, #1 +# CHECK-NEXT: - - - - - - - - - - - 1.00 str w0, [x21, x18, lsl #2] + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0 + +# CHECK: [0,0] DeeER. . . . ldr w4, [x2], #4 +# CHECK-NEXT: [0,1] .DeeER . . . ldr w5, [x3] +# CHECK-NEXT: [0,2] . DeeeER. . . madd w0, w5, w4, w0 +# CHECK-NEXT: [0,3] . DeeE-R. . . add x3, x3, x13 +# CHECK-NEXT: [0,4] . DeeER. . . subs x1, x1, #1 +# CHECK-NEXT: [0,5] . . DeeeER . . str w0, [x21, x18, lsl #2] +# CHECK-NEXT: [1,0] . . DeeER . . ldr w4, [x2], #4 +# CHECK-NEXT: [1,1] . . DeeER . . ldr w5, [x3] +# CHECK-NEXT: [1,2] . . . DeeeER . madd w0, w5, w4, w0 +# CHECK-NEXT: [1,3] . . . DeeE-R . add x3, x3, x13 +# CHECK-NEXT: [1,4] . . . DeeER . subs x1, x1, #1 +# CHECK-NEXT: [1,5] . . . DeeeER str w0, [x21, x18, lsl #2] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 ldr w4, [x2], #4 +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 ldr w5, [x3] +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 madd w0, w5, w4, w0 +# CHECK-NEXT: 3. 2 0.0 0.0 1.0 add x3, x3, x13 +# CHECK-NEXT: 4. 2 0.0 0.0 0.0 subs x1, x1, #1 +# CHECK-NEXT: 5. 2 0.0 0.0 0.0 str w0, [x21, x18, lsl #2] +# CHECK-NEXT: 2 0.0 0.0 0.2 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s @@ -0,0 +1,135 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --all-views --iterations=2 < %s | FileCheck %s + +sdiv w12, w21, w0 +add w8, w8, #1 +add w1, w2, w0 +add w3, w4, #1 +add w5, w6, w0 +add w7, w9, w0 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 18 +# CHECK-NEXT: Total uOps: 12 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.67 +# CHECK-NEXT: IPC: 0.67 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Cycles with backend pressure increase [ 27.78% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 27.78% ] +# CHECK-NEXT: Data Dependencies: [ 0.00% ] +# CHECK-NEXT: - Register Dependencies [ 0.00% ] +# CHECK-NEXT: - Memory Dependencies [ 0.00% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 8 8.00 sdiv w12, w21, w0 +# CHECK-NEXT: 1 3 0.50 add w8, w8, #1 +# CHECK-NEXT: 1 3 0.50 add w1, w2, w0 +# CHECK-NEXT: 1 3 0.50 add w3, w4, #1 +# CHECK-NEXT: 1 3 0.50 add w5, w6, w0 +# CHECK-NEXT: 1 3 0.50 add w7, w9, w0 + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 0 +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 5 (27.8%) + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 12 (66.7%) +# CHECK-NEXT: 2, 6 (33.3%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: [# issued], [# cycles] +# CHECK-NEXT: 0, 12 (66.7%) +# CHECK-NEXT: 2, 6 (33.3%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: +# CHECK-NEXT: [# retired], [# cycles] +# CHECK-NEXT: 0, 16 (88.9%) +# CHECK-NEXT: 6, 2 (11.1%) + +# CHECK: Total ROB Entries: 64 +# CHECK-NEXT: Max Used ROB Entries: 8 ( 12.5% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 5 ( 7.8% ) + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 12 +# CHECK-NEXT: Max number of mappings used: 8 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 2.50 2.50 - 8.00 - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - 8.00 - - - - - - - - sdiv w12, w21, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w8, w8, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w1, w2, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w3, w4, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w5, w6, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w7, w9, w0 + +# CHECK: Timeline view: +# CHECK-NEXT: 01234567 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeeeeeER. . . sdiv w12, w21, w0 +# CHECK-NEXT: [0,1] DeeE-----R. . . add w8, w8, #1 +# CHECK-NEXT: [0,2] .DeeE----R. . . add w1, w2, w0 +# CHECK-NEXT: [0,3] .DeeE----R. . . add w3, w4, #1 +# CHECK-NEXT: [0,4] . DeeE---R. . . add w5, w6, w0 +# CHECK-NEXT: [0,5] . DeeE---R. . . add w7, w9, w0 +# CHECK-NEXT: [1,0] . . DeeeeeeeER sdiv w12, w21, w0 +# CHECK-NEXT: [1,1] . . DeeE-----R add w8, w8, #1 +# CHECK-NEXT: [1,2] . . DeeE----R add w1, w2, w0 +# CHECK-NEXT: [1,3] . . DeeE----R add w3, w4, #1 +# CHECK-NEXT: [1,4] . . DeeE---R add w5, w6, w0 +# CHECK-NEXT: [1,5] . . DeeE---R add w7, w9, w0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 sdiv w12, w21, w0 +# CHECK-NEXT: 1. 2 0.0 0.0 5.0 add w8, w8, #1 +# CHECK-NEXT: 2. 2 0.0 0.0 4.0 add w1, w2, w0 +# CHECK-NEXT: 3. 2 0.0 0.0 4.0 add w3, w4, #1 +# CHECK-NEXT: 4. 2 0.0 0.0 3.0 add w5, w6, w0 +# CHECK-NEXT: 5. 2 0.0 0.0 3.0 add w7, w9, w0 +# CHECK-NEXT: 2 0.0 0.0 3.2 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-out-of-order-retire.s @@ -0,0 +1,136 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --all-views --iterations=2 < %s | FileCheck %s + +fdiv s1, s2, s3 +add w8, w8, #1 +add w1, w2, w0 +add w3, w4, #1 +add w5, w6, w0 +add w7, w9, w0 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 25 +# CHECK-NEXT: Total uOps: 12 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.48 +# CHECK-NEXT: IPC: 0.48 +# CHECK-NEXT: Block RThroughput: 10.0 + +# CHECK: Cycles with backend pressure increase [ 28.00% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 28.00% ] +# CHECK-NEXT: Data Dependencies: [ 0.00% ] +# CHECK-NEXT: - Register Dependencies [ 0.00% ] +# CHECK-NEXT: - Memory Dependencies [ 0.00% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 13 10.00 fdiv s1, s2, s3 +# CHECK-NEXT: 1 3 0.50 add w8, w8, #1 +# CHECK-NEXT: 1 3 0.50 add w1, w2, w0 +# CHECK-NEXT: 1 3 0.50 add w3, w4, #1 +# CHECK-NEXT: 1 3 0.50 add w5, w6, w0 +# CHECK-NEXT: 1 3 0.50 add w7, w9, w0 + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 0 +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 7 (28.0%) + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 19 (76.0%) +# CHECK-NEXT: 2, 6 (24.0%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: [# issued], [# cycles] +# CHECK-NEXT: 0, 19 (76.0%) +# CHECK-NEXT: 2, 6 (24.0%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: +# CHECK-NEXT: [# retired], [# cycles] +# CHECK-NEXT: 0, 18 (72.0%) +# CHECK-NEXT: 1, 2 (8.0%) +# CHECK-NEXT: 2, 5 (20.0%) + +# CHECK: Total ROB Entries: 64 +# CHECK-NEXT: Max Used ROB Entries: 7 ( 10.9% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 12 +# CHECK-NEXT: Max number of mappings used: 7 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 2.50 2.50 - - - - 10.00 - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - 10.00 - - - - - fdiv s1, s2, s3 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w8, w8, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w1, w2, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w3, w4, #1 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w5, w6, w0 +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w7, w9, w0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeeeeeeeeeER. . . fdiv s1, s2, s3 +# CHECK-NEXT: [0,1] DeeER. . . . . add w8, w8, #1 +# CHECK-NEXT: [0,2] .DeeER . . . . add w1, w2, w0 +# CHECK-NEXT: [0,3] .DeeER . . . . add w3, w4, #1 +# CHECK-NEXT: [0,4] . DeeER . . . . add w5, w6, w0 +# CHECK-NEXT: [0,5] . DeeER . . . . add w7, w9, w0 +# CHECK-NEXT: [1,0] . . DeeeeeeeeeeeeER fdiv s1, s2, s3 +# CHECK-NEXT: [1,1] . . DeeER. . . add w8, w8, #1 +# CHECK-NEXT: [1,2] . . .DeeER . . add w1, w2, w0 +# CHECK-NEXT: [1,3] . . .DeeER . . add w3, w4, #1 +# CHECK-NEXT: [1,4] . . . DeeER . . add w5, w6, w0 +# CHECK-NEXT: [1,5] . . . DeeER . . add w7, w9, w0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 fdiv s1, s2, s3 +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 add w8, w8, #1 +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 add w1, w2, w0 +# CHECK-NEXT: 3. 2 0.0 0.0 0.0 add w3, w4, #1 +# CHECK-NEXT: 4. 2 0.0 0.0 0.0 add w5, w6, w0 +# CHECK-NEXT: 5. 2 0.0 0.0 0.0 add w7, w9, w0 +# CHECK-NEXT: 2 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/in-order-cpu.s b/llvm/test/tools/llvm-mca/X86/in-order-cpu.s --- a/llvm/test/tools/llvm-mca/X86/in-order-cpu.s +++ b/llvm/test/tools/llvm-mca/X86/in-order-cpu.s @@ -1,3 +1,3 @@ -# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o /dev/null 2>&1 | FileCheck %s - -# CHECK: error: please specify an out-of-order cpu. 'atom' is an in-order cpu. +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o /dev/null 2>&1 | FileCheck %s +# CHECK: warning: support for in-order CPU 'atom' is experimental. +movsbw %al, %di diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -335,9 +335,8 @@ return 1; if (!PrintInstructionTables && !STI->getSchedModel().isOutOfOrder()) { - WithColor::error() << "please specify an out-of-order cpu. '" << MCPU - << "' is an in-order cpu.\n"; - return 1; + WithColor::warning() << "support for in-order CPU '" << MCPU + << "' is experimental.\n"; } if (!STI->getSchedModel().hasInstrSchedModel()) { diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -999,6 +999,7 @@ SCDesc.NumMicroOps = 0; SCDesc.BeginGroup = false; SCDesc.EndGroup = false; + SCDesc.RetireOOO = false; SCDesc.WriteProcResIdx = 0; SCDesc.WriteLatencyIdx = 0; SCDesc.ReadAdvanceIdx = 0; @@ -1101,6 +1102,7 @@ SCDesc.EndGroup |= WriteRes->getValueAsBit("EndGroup"); SCDesc.BeginGroup |= WriteRes->getValueAsBit("SingleIssue"); SCDesc.EndGroup |= WriteRes->getValueAsBit("SingleIssue"); + SCDesc.RetireOOO |= WriteRes->getValueAsBit("RetireOOO"); // Create an entry for each ProcResource listed in WriteRes. RecVec PRVec = WriteRes->getValueAsListOfDefs("ProcResources"); @@ -1299,7 +1301,7 @@ std::vector &SCTab = SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())]; - OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup," + OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup, RetireOOO," << " WriteProcResIdx,#, WriteLatencyIdx,#, ReadAdvanceIdx,#}\n"; OS << "static const llvm::MCSchedClassDesc " << PI->ModelName << "SchedClasses[] = {\n"; @@ -1310,7 +1312,7 @@ && "invalid class not first"); OS << " {DBGFIELD(\"InvalidSchedClass\") " << MCSchedClassDesc::InvalidNumMicroOps - << ", false, false, 0, 0, 0, 0, 0, 0},\n"; + << ", false, false, false, 0, 0, 0, 0, 0, 0},\n"; for (unsigned SCIdx = 1, SCEnd = SCTab.size(); SCIdx != SCEnd; ++SCIdx) { MCSchedClassDesc &MCDesc = SCTab[SCIdx]; @@ -1321,6 +1323,7 @@ OS << MCDesc.NumMicroOps << ", " << ( MCDesc.BeginGroup ? "true" : "false" ) << ", " << ( MCDesc.EndGroup ? "true" : "false" ) + << ", " << ( MCDesc.RetireOOO ? "true" : "false" ) << ", " << format("%2d", MCDesc.WriteProcResIdx) << ", " << MCDesc.NumWriteProcResEntries << ", " << format("%2d", MCDesc.WriteLatencyIdx)