diff --git a/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h b/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h
--- a/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h
+++ b/llvm/include/llvm/MCA/HardwareUnits/RegisterFile.h
@@ -172,11 +172,6 @@
   void freePhysRegs(const RegisterRenamingInfo &Entry,
                     MutableArrayRef<unsigned> FreedPhysRegs);
 
-  // Collects writes that are in a RAW dependency with RS.
-  // This method is called from `addRegisterRead()`.
-  void collectWrites(const ReadState &RS,
-                     SmallVectorImpl<WriteRef> &Writes) const;
-
   // Create an instance of RegisterMappingTracker for every register file
   // specified by the processor model.
   // If no register file is specified, then this method creates a default
@@ -187,6 +182,10 @@
   RegisterFile(const MCSchedModel &SM, const MCRegisterInfo &mri,
                unsigned NumRegs = 0);
 
+  // Collects writes that are in a RAW dependency with RS.
+  void collectWrites(const ReadState &RS,
+                     SmallVectorImpl<WriteRef> &Writes) const;
+
   // This method updates the register mappings inserting a new register
   // definition. This method is also responsible for updating the number of
   // allocated physical registers in each register file modified by the write.
diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
@@ -0,0 +1,88 @@
+//===---------------------- InOrderIssueStage.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// InOrderIssueStage implements an in-order execution pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCA_IN_ORDER_ISSUE_STAGE_H
+#define LLVM_MCA_IN_ORDER_ISSUE_STAGE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/HardwareUnits/ResourceManager.h"
+#include "llvm/MCA/SourceMgr.h"
+#include "llvm/MCA/Stages/Stage.h"
+
+#include <memory>
+#include <queue>
+
+namespace llvm {
+class MCSchedModel;
+class MCSubtargetInfo;
+
+namespace mca {
+class RegisterFile;
+
+/// A pipeline stage that issues instructions in the order in which they were
+/// dispatched to it. Dispatched instructions wait in InstQueue; when the
+/// instruction at the front of the queue cannot be issued, the whole queue
+/// stalls.
+class InOrderIssueStage final : public Stage {
+  const MCSchedModel &SM;
+  const MCSubtargetInfo &STI;
+  RegisterFile &PRF;
+  std::unique_ptr<ResourceManager> RM;
+
+  /// Instructions that were issued, but not retired yet. Executed instructions
+  /// (with CyclesLeft == 0) are moved to the end of IssuedInst.
+  SmallVector<InstRef, 4> IssuedInst;
+  /// Number of executed instructions at the end of IssuedInst.
+  int NumExecuted;
+
+  /// Instructions that must be issued during the next cycle. If the front
+  /// instruction cannot execute due to an unmet register or resource
+  /// dependency, the whole queue is stalled for StallCyclesLeft.
+  std::queue<InstRef> InstQueue;
+  int StallCyclesLeft;
+
+  /// Number of instructions that can be added to InstQueue (dispatched) during
+  /// this cycle.
+  int Bandwidth;
+
+  InOrderIssueStage(const InOrderIssueStage &Other) = delete;
+  InOrderIssueStage &operator=(const InOrderIssueStage &Other) = delete;
+
+  /// If IR has an unmet register or resource dependency, canExecute returns
+  /// false. StallCycles is set to the number of cycles to wait before the
+  /// next attempt to issue the instruction, or to zero if it can be issued.
+  bool canExecute(const InstRef &IR, int *StallCycles) const;
+
+  /// Issue the instruction, or update StallCycles if IR is stalled.
+  Error tryIssue(InstRef &IR, int *StallCycles);
+
+  /// Update status of instructions from IssuedInst.
+  Error updateIssuedInst();
+
+public:
+  InOrderIssueStage(RegisterFile &PRF, const MCSchedModel &SM,
+                    const MCSubtargetInfo &STI)
+      : SM(SM), STI(STI), PRF(PRF), RM(std::make_unique<ResourceManager>(SM)),
+        NumExecuted(0), StallCyclesLeft(0), Bandwidth(0) {}
+
+  bool isAvailable(const InstRef &) const override;
+  bool hasWorkToComplete() const override;
+  Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error cycleEnd() override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif // LLVM_MCA_IN_ORDER_ISSUE_STAGE_H
diff --git a/llvm/include/llvm/MCA/Stages/RetireStage.h b/llvm/include/llvm/MCA/Stages/RetireStage.h
--- a/llvm/include/llvm/MCA/Stages/RetireStage.h
+++ b/llvm/include/llvm/MCA/Stages/RetireStage.h
@@ -26,7 +26,9 @@
 
 class RetireStage final : public Stage {
   // Owner will go away when we move listeners/eventing to the stages.
-  RetireControlUnit &RCU;
+  // May be null: without a retire control unit, instructions are retired as
+  // soon as they reach this stage.
+  RetireControlUnit *RCU;
   RegisterFile &PRF;
   LSUnitBase &LSU;
 
@@ -34,10 +36,10 @@
   RetireStage &operator=(const RetireStage &Other) = delete;
 
 public:
-  RetireStage(RetireControlUnit &R, RegisterFile &F, LSUnitBase &LS)
+  RetireStage(RetireControlUnit *R, RegisterFile &F, LSUnitBase &LS)
       : Stage(), RCU(R), PRF(F), LSU(LS) {}
 
-  bool hasWorkToComplete() const override { return !RCU.isEmpty(); }
+  bool hasWorkToComplete() const override { return RCU && !RCU->isEmpty(); }
   Error cycleStart() override;
   Error execute(InstRef &IR) override;
   void notifyInstructionRetired(const InstRef &IR) const;
diff --git a/llvm/lib/MCA/CMakeLists.txt b/llvm/lib/MCA/CMakeLists.txt
--- a/llvm/lib/MCA/CMakeLists.txt
+++ b/llvm/lib/MCA/CMakeLists.txt
@@ -17,6 +17,7 @@
+  Stages/InOrderIssueStage.cpp
   Stages/InstructionTables.cpp
   Stages/MicroOpQueueStage.cpp
   Stages/RetireStage.cpp
   Stages/Stage.cpp
   Support.cpp
 
diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp
--- a/llvm/lib/MCA/Context.cpp
+++ b/llvm/lib/MCA/Context.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MCA/Stages/DispatchStage.h"
 #include "llvm/MCA/Stages/EntryStage.h"
 #include "llvm/MCA/Stages/ExecuteStage.h"
+#include "llvm/MCA/Stages/InOrderIssueStage.h"
 #include "llvm/MCA/Stages/MicroOpQueueStage.h"
 #include "llvm/MCA/Stages/RetireStage.h"
 
@@ -32,19 +33,35 @@
   const MCSchedModel &SM = STI.getSchedModel();
 
   // Create the hardware units defining the backend.
-  auto RCU = std::make_unique<RetireControlUnit>(SM);
+  std::unique_ptr<RetireControlUnit> RCU;
+  if (SM.isOutOfOrder())
+    RCU = std::make_unique<RetireControlUnit>(SM);
+
   auto PRF = std::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
   auto LSU = std::make_unique<LSUnit>(SM, Opts.LoadQueueSize,
                                       Opts.StoreQueueSize, Opts.AssumeNoAlias);
   auto HWS = std::make_unique<Scheduler>(SM, *LSU);
 
-  // Create the pipeline stages.
-  auto Fetch = std::make_unique<EntryStage>(SrcMgr);
-  auto Dispatch = std::make_unique<DispatchStage>(STI, MRI, Opts.DispatchWidth,
-                                                  *RCU, *PRF);
-  auto Execute =
-      std::make_unique<ExecuteStage>(*HWS, Opts.EnableBottleneckAnalysis);
-  auto Retire = std::make_unique<RetireStage>(*RCU, *PRF, *LSU);
+  // Build the pipeline.
+  auto StagePipeline = std::make_unique<Pipeline>();
+  StagePipeline->appendStage(std::make_unique<EntryStage>(SrcMgr));
+  if (SM.isOutOfOrder()) {
+    // Out-of-order pipeline
+    if (Opts.MicroOpQueueSize) {
+      StagePipeline->appendStage(std::make_unique<MicroOpQueueStage>(
+          Opts.MicroOpQueueSize, Opts.DecodersThroughput));
+    }
+    StagePipeline->appendStage(std::make_unique<DispatchStage>(
+        STI, MRI, Opts.DispatchWidth, *RCU, *PRF));
+    StagePipeline->appendStage(
+        std::make_unique<ExecuteStage>(*HWS, Opts.EnableBottleneckAnalysis));
+  } else {
+    // In-order pipeline
+    StagePipeline->appendStage(
+        std::make_unique<InOrderIssueStage>(*PRF, SM, STI));
+  }
+  StagePipeline->appendStage(
+      std::make_unique<RetireStage>(RCU.get(), *PRF, *LSU));
 
   // Pass the ownership of all the hardware units to this Context.
   addHardwareUnit(std::move(RCU));
@@ -52,15 +69,6 @@
   addHardwareUnit(std::move(LSU));
   addHardwareUnit(std::move(HWS));
 
-  // Build the pipeline.
-  auto StagePipeline = std::make_unique<Pipeline>();
-  StagePipeline->appendStage(std::move(Fetch));
-  if (Opts.MicroOpQueueSize)
-    StagePipeline->appendStage(std::make_unique<MicroOpQueueStage>(
-        Opts.MicroOpQueueSize, Opts.DecodersThroughput));
-  StagePipeline->appendStage(std::move(Dispatch));
-  StagePipeline->appendStage(std::move(Execute));
-  StagePipeline->appendStage(std::move(Retire));
   return StagePipeline;
 }
 
diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
@@ -0,0 +1,295 @@
+//===---------------------- InOrderIssueStage.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// InOrderIssueStage implements an in-order execution pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MCA/Stages/InOrderIssueStage.h"
+
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MCA/HWEventListener.h"
+#include "llvm/MCA/HardwareUnits/RegisterFile.h"
+#include "llvm/MCA/HardwareUnits/ResourceManager.h"
+#include "llvm/MCA/Instruction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+
+#include <algorithm>
+#include <cassert>
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace llvm {
+namespace mca {
+
+/// The stage has work to do while instructions are queued for issue or have
+/// been issued but not yet moved to the next stage.
+bool InOrderIssueStage::hasWorkToComplete() const {
+  return !IssuedInst.empty() || !InstQueue.empty();
+}
+
+/// A new instruction is accepted only while the bandwidth computed by
+/// cycleStart() has not been used up by execute().
+bool InOrderIssueStage::isAvailable(const InstRef &IR) const {
+  return Bandwidth > 0;
+}
+
+/// Return true if ResourceManager::checkAvailability() reports a non-zero
+/// result for IR, which this stage treats as a resource hazard.
+static bool hasResourceHazard(const ResourceManager &RM, const InstRef &IR) {
+  if (RM.checkAvailability(IR.getInstruction()->getDesc())) {
+    LLVM_DEBUG(dbgs() << "[E] Stall #" << IR << '\n');
+    return true;
+  }
+
+  return false;
+}
+
+/// Return the number of cycles left until the register requirements of the
+/// instruction are met.
+static int checkRegisterHazard(const RegisterFile &PRF, const MCSchedModel &SM,
+                               const MCSubtargetInfo &STI, const InstRef &IR) {
+  int StallCycles = 0;
+  SmallVector<WriteRef, 4> Writes;
+
+  for (const ReadState &RS : IR.getInstruction()->getUses()) {
+    const ReadDescriptor &RD = RS.getDescriptor();
+    const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID);
+
+    PRF.collectWrites(RS, Writes);
+    for (const WriteRef &WR : Writes) {
+      const WriteState *WS = WR.getWriteState();
+      unsigned WriteResID = WS->getWriteResourceID();
+      int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID);
+      LLVM_DEBUG(dbgs() << "[E] ReadAdvance for #" << IR << ": " << ReadAdvance
+                        << '\n');
+
+      assert(WS->getCyclesLeft() != UNKNOWN_CYCLES &&
+             "Cycles left must be known for a pending write.");
+      int CyclesLeft = static_cast<int>(WS->getCyclesLeft());
+      // Only the cycles not covered by the read advance stall the read.
+      if (CyclesLeft > ReadAdvance) {
+        LLVM_DEBUG(dbgs() << "[E] Register hazard: " << WS->getRegisterID()
+                          << '\n');
+        StallCycles = std::max(StallCycles, CyclesLeft - ReadAdvance);
+      }
+    }
+    Writes.clear();
+  }
+
+  return StallCycles;
+}
+
+bool InOrderIssueStage::canExecute(const InstRef &IR, int *StallCycles) const {
+  *StallCycles = 0;
+
+  if (int RegStall = checkRegisterHazard(PRF, SM, STI, IR)) {
+    *StallCycles = RegStall;
+    // FIXME: add a parameter to HWStallEvent to indicate a number of cycles.
+    // Until then, one stall/pressure event pair is sent per stalled cycle.
+    for (int I = 0; I < RegStall; ++I) {
+      notifyEvent<HWStallEvent>(
+          HWStallEvent(HWStallEvent::RegisterFileStall, IR));
+      notifyEvent<HWPressureEvent>(
+          HWPressureEvent(HWPressureEvent::REGISTER_DEPS, IR));
+    }
+  } else if (hasResourceHazard(*RM, IR)) {
+    // Stall for one cycle; the hazard is checked again on the next attempt.
+    *StallCycles = 1;
+    notifyEvent<HWStallEvent>(
+        HWStallEvent(HWStallEvent::DispatchGroupStall, IR));
+    notifyEvent<HWPressureEvent>(
+        HWPressureEvent(HWPressureEvent::RESOURCES, IR));
+  }
+
+  return *StallCycles == 0;
+}
+
+/// Register the reads and writes of IS with PRF, unless IS is eliminated.
+/// UsedRegs is forwarded to RegisterFile::addRegisterWrite().
+static void addRegisterReadWrite(RegisterFile &PRF, Instruction &IS,
+                                 unsigned SourceIndex,
+                                 const MCSubtargetInfo &STI,
+                                 SmallVectorImpl<unsigned> &UsedRegs) {
+  if (IS.isEliminated())
+    return;
+
+  for (ReadState &RS : IS.getUses())
+    PRF.addRegisterRead(RS, STI);
+
+  for (WriteState &WS : IS.getDefs())
+    PRF.addRegisterWrite(WriteRef(SourceIndex, &WS), UsedRegs);
+}
+
+/// Send the Ready and Issued events for IR through S.notifyEvent().
+static void notifyInstructionExecute(
+    const InstRef &IR,
+    const SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &UsedRes,
+    const Stage &S) {
+  S.notifyEvent<HWInstructionEvent>(
+      HWInstructionEvent(HWInstructionEvent::Ready, IR));
+  S.notifyEvent<HWInstructionEvent>(HWInstructionIssuedEvent(IR, UsedRes));
+
+  LLVM_DEBUG(dbgs() << "[E] Issued #" << IR << "\n");
+}
+
+/// Send the Dispatched event for IR, built from UsedRegs and Ops, through
+/// S.notifyEvent().
+static void notifyInstructionDispatch(const InstRef &IR, unsigned Ops,
+                                      const SmallVectorImpl<unsigned> &UsedRegs,
+                                      const Stage &S) {
+  S.notifyEvent<HWInstructionEvent>(
+      HWInstructionDispatchedEvent(IR, UsedRegs, Ops));
+
+  LLVM_DEBUG(dbgs() << "[E] Dispatched #" << IR << "\n");
+}
+
+/// Dispatch IR: append it to InstQueue and consume one unit of Bandwidth. The
+/// instruction is issued later, from cycleStart().
+llvm::Error InOrderIssueStage::execute(InstRef &IR) {
+  Instruction &IS = *IR.getInstruction();
+
+  IS.dispatch(0);
+
+  SmallVector<unsigned, 4> UsedRegs(PRF.getNumRegisterFiles(), 0U);
+  // Register file information is not available yet. Send a dispatch
+  // event with "zero" uops now, and emit a proper event later.
+  notifyInstructionDispatch(IR, /*Ops=*/0, UsedRegs, *this);
+
+  --Bandwidth;
+  InstQueue.push(IR);
+
+  return llvm::ErrorSuccess();
+}
+
+llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, int *StallCycles) {
+  Instruction &IS = *IR.getInstruction();
+  unsigned SourceIndex = IR.getSourceIndex();
+
+  if (!canExecute(IR, StallCycles)) {
+    LLVM_DEBUG(dbgs() << "[E] Stalled #" << IR << " for " << *StallCycles
+                      << " cycles\n");
+    return llvm::ErrorSuccess();
+  }
+
+  SmallVector<unsigned, 4> UsedRegs(PRF.getNumRegisterFiles());
+  addRegisterReadWrite(PRF, IS, SourceIndex, STI, UsedRegs);
+
+  // Notify dispatch the second time to update registers used by the
+  // instruction.
+  notifyInstructionDispatch(IR, IS.getDesc().NumMicroOps, UsedRegs, *this);
+
+  SmallVector<std::pair<ResourceRef, ResourceCycles>, 4> UsedResources;
+  RM->issueInstruction(IS.getDesc(), UsedResources);
+  IS.execute(SourceIndex);
+
+  // Replace resource masks with valid resource processor IDs.
+  for (std::pair<ResourceRef, ResourceCycles> &Use : UsedResources) {
+    uint64_t Mask = Use.first.first;
+    Use.first.first = RM->resolveResourceMask(Mask);
+  }
+  notifyInstructionExecute(IR, UsedResources, *this);
+
+  // Keep the NumExecuted executed instructions at the end of IssuedInst: the
+  // new entry swaps places with the first of them.
+  IssuedInst.push_back(IR);
+  if (NumExecuted)
+    std::iter_swap(IssuedInst.end() - 1, IssuedInst.end() - NumExecuted - 1);
+
+  return llvm::ErrorSuccess();
+}
+
+llvm::Error InOrderIssueStage::updateIssuedInst() {
+  // Retire instructions executed on the previous cycle
+  if (NumExecuted) {
+    for (auto I = IssuedInst.end() - NumExecuted, E = IssuedInst.end(); I != E;
+         ++I) {
+      LLVM_DEBUG(dbgs() << "[E] Retiring #" << *I << " (out of " << NumExecuted
+                        << ")\n");
+      if (llvm::Error Err = moveToTheNextStage(*I))
+        return Err;
+    }
+    IssuedInst.resize(IssuedInst.size() - NumExecuted);
+    NumExecuted = 0;
+  }
+
+  // Update other instructions. Executed instructions will be retired during the
+  // next cycle.
+  for (auto I = IssuedInst.begin(), E = IssuedInst.end();
+       I != (E - NumExecuted);) {
+    InstRef &IR = *I;
+    Instruction &IS = *IR.getInstruction();
+
+    IS.cycleEvent();
+    if (!IS.isExecuted()) {
+      LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR
+                        << " is still executing\n");
+      ++I;
+      continue;
+    }
+    notifyEvent<HWInstructionEvent>(
+        HWInstructionEvent(HWInstructionEvent::Executed, IR));
+
+    LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR << " is executed\n");
+    // Swap the executed instruction into the tail of IssuedInst. I is not
+    // advanced: it now refers to the instruction that was swapped in.
+    ++NumExecuted;
+    std::iter_swap(I, E - NumExecuted);
+  }
+
+  return llvm::ErrorSuccess();
+}
+
+/// Start a new cycle: release resources, update the issued instructions, issue
+/// queued instructions until one of them stalls, and compute the Bandwidth
+/// available to execute() during this cycle.
+llvm::Error InOrderIssueStage::cycleStart() {
+  // Release consumed resources.
+  SmallVector<ResourceRef, 4> Freed;
+  RM->cycleEvent(Freed);
+
+  if (llvm::Error Err = updateIssuedInst())
+    return Err;
+
+  // Issue instructions scheduled for this cycle
+  while (!StallCyclesLeft && !InstQueue.empty()) {
+    InstRef &IR = InstQueue.front();
+    if (llvm::Error Err = tryIssue(IR, &StallCyclesLeft))
+      return Err;
+
+    // No stall
+    if (!StallCyclesLeft)
+      InstQueue.pop();
+  }
+
+  if (!StallCyclesLeft) {
+    Bandwidth = SM.IssueWidth;
+  } else if (StallCyclesLeft == 1) {
+    // The stalled instruction will be ready during the next cycle. Add more
+    // instructions if allowed.
+    Bandwidth =
+        static_cast<int>(SM.IssueWidth) - static_cast<int>(InstQueue.size());
+    assert(Bandwidth >= 0 && "InstQueue overflow.");
+  } else {
+    Bandwidth = 0;
+  }
+
+  return llvm::ErrorSuccess();
+}
+
+/// End a cycle: one cycle of the current stall, if any, has elapsed.
+llvm::Error InOrderIssueStage::cycleEnd() {
+  if (StallCyclesLeft > 0)
+    --StallCyclesLeft;
+  return llvm::ErrorSuccess();
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/lib/MCA/Stages/RetireStage.cpp b/llvm/lib/MCA/Stages/RetireStage.cpp
--- a/llvm/lib/MCA/Stages/RetireStage.cpp
+++ b/llvm/lib/MCA/Stages/RetireStage.cpp
@@ -23,19 +23,19 @@
 namespace mca {
 
 llvm::Error RetireStage::cycleStart() {
-  if (RCU.isEmpty())
+  if (!RCU || RCU->isEmpty())
     return llvm::ErrorSuccess();
 
-  const unsigned MaxRetirePerCycle = RCU.getMaxRetirePerCycle();
+  const unsigned MaxRetirePerCycle = RCU->getMaxRetirePerCycle();
   unsigned NumRetired = 0;
-  while (!RCU.isEmpty()) {
+  while (!RCU->isEmpty()) {
     if (MaxRetirePerCycle != 0 && NumRetired == MaxRetirePerCycle)
       break;
-    const RetireControlUnit::RUToken &Current = RCU.getCurrentToken();
+    const RetireControlUnit::RUToken &Current = RCU->getCurrentToken();
     if (!Current.Executed)
       break;
     notifyInstructionRetired(Current.IR);
-    RCU.consumeCurrentToken();
+    RCU->consumeCurrentToken();
     NumRetired++;
   }
 
@@ -43,7 +43,16 @@
 }
 
 llvm::Error RetireStage::execute(InstRef &IR) {
-  RCU.onInstructionExecuted(IR.getInstruction()->getRCUTokenID());
+  Instruction &IS = *IR.getInstruction();
+  if (RCU) {
+    RCU->onInstructionExecuted(IS.getRCUTokenID());
+    return llvm::ErrorSuccess();
+  }
+
+  // There is no retire control unit: retire the instruction immediately.
+  IS.retire();
+  notifyInstructionRetired(IR);
+
   return llvm::ErrorSuccess();
 }
 
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-add-sequence.s
@@ -0,0 +1,82 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --timeline --iterations=2 < %s |
FileCheck %s + +add w2, w3, #1 +add w4, w3, #2, lsl #12 +add w0, w4, #3 +add w1, w0, #4 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 8 +# CHECK-NEXT: Total Cycles: 11 +# CHECK-NEXT: Total uOps: 8 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.73 +# CHECK-NEXT: IPC: 0.73 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 3 0.50 add w2, w3, #1 +# CHECK-NEXT: 1 3 0.50 add w4, w3, #2, lsl #12 +# CHECK-NEXT: 1 3 0.50 add w0, w4, #3 +# CHECK-NEXT: 1 3 0.50 add w1, w0, #4 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - add w2, w3, #1 +# CHECK-NEXT: 1.00 - - - - - - - - - - - add w4, w3, #2, lsl #12 +# CHECK-NEXT: - 1.00 - - - - - - - - - - add w0, w4, #3 +# CHECK-NEXT: 1.00 - - - - - - - - - - - add w1, w0, #4 + +# CHECK: Timeline view: +# CHECK-NEXT: 0 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeER . add w2, w3, #1 +# CHECK-NEXT: [0,1] DeeeER . add w4, w3, #2, lsl #12 +# CHECK-NEXT: [0,2] .DeeeER . 
add w0, w4, #3 +# CHECK-NEXT: [0,3] .D=eeeER . add w1, w0, #4 +# CHECK-NEXT: [1,0] . DeeeER . add w2, w3, #1 +# CHECK-NEXT: [1,1] . DeeeER . add w4, w3, #2, lsl #12 +# CHECK-NEXT: [1,2] . D=eeeER. add w0, w4, #3 +# CHECK-NEXT: [1,3] . D=eeeER add w1, w0, #4 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 1.0 0.0 0.0 add w2, w3, #1 +# CHECK-NEXT: 1. 2 1.0 0.0 0.0 add w4, w3, #2, lsl #12 +# CHECK-NEXT: 2. 2 1.5 0.0 0.0 add w0, w4, #3 +# CHECK-NEXT: 3. 2 2.0 0.0 0.0 add w1, w0, #4 +# CHECK-NEXT: 2 1.4 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s @@ -0,0 +1,101 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --iterations=2 < %s | FileCheck %s + +ldr w4, [x2], #4 +ldr w5, [x3] +madd w0, w5, w4, w0 +add x3, x3, x13 +subs x1, x1, #1 +str w0, [x21, x18, lsl #2] + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total uOps: 14 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.67 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 3 1.00 * ldr w4, [x2], #4 +# CHECK-NEXT: 1 3 1.00 * ldr w5, [x3] +# CHECK-NEXT: 1 4 1.00 madd w0, w5, w4, w0 +# 
CHECK-NEXT: 1 3 0.50 add x3, x3, x13 +# CHECK-NEXT: 1 3 0.50 subs x1, x1, #1 +# CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 10 (47.6%) +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 1 (4.8%) + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 12 (57.1%) +# CHECK-NEXT: 1, 5 (23.8%) +# CHECK-NEXT: 2, 3 (14.3%) +# CHECK-NEXT: 3, 1 (4.8%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: [# issued], [# cycles] +# CHECK-NEXT: 0, 12 (57.1%) +# CHECK-NEXT: 1, 5 (23.8%) +# CHECK-NEXT: 2, 3 (14.3%) +# CHECK-NEXT: 3, 1 (4.8%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. 
+ +# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: +# CHECK-NEXT: [# retired], [# cycles] +# CHECK-NEXT: 0, 12 (57.1%) +# CHECK-NEXT: 1, 6 (28.6%) +# CHECK-NEXT: 2, 3 (14.3%) + +# CHECK: Total ROB Entries: 0 +# CHECK-NEXT: Max Used ROB Entries: 6 ( inf% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( inf% ) + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 14 +# CHECK-NEXT: Max number of mappings used: 6 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 1.00 1.00 - - - - - - - 2.00 1.00 1.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w4, [x2], #4 +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w5, [x3] +# CHECK-NEXT: - - - - - - - - - - 1.00 - madd w0, w5, w4, w0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - add x3, x3, x13 +# CHECK-NEXT: 1.00 - - - - - - - - - - - subs x1, x1, #1 +# CHECK-NEXT: - - - - - - - - - - - 1.00 str w0, [x21, x18, lsl #2] diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s @@ -0,0 +1,140 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca 
-mtriple=aarch64 -mcpu=cortex-a55 --all-views --iterations=2 < %s | FileCheck %s + +ldr w4, [x2], #4 +ldr w5, [x3] +madd w0, w5, w4, w0 +add x3, x3, x13 +subs x1, x1, #1 +str w0, [x21, x18, lsl #2] + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total uOps: 14 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.67 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.5 + +# CHECK: Cycles with backend pressure increase [ 23.81% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 4.76% ] +# CHECK-NEXT: Data Dependencies: [ 19.05% ] +# CHECK-NEXT: - Register Dependencies [ 19.05% ] +# CHECK-NEXT: - Memory Dependencies [ 0.00% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 2 3 1.00 * ldr w4, [x2], #4 +# CHECK-NEXT: 1 3 1.00 * ldr w5, [x3] +# CHECK-NEXT: 1 4 1.00 madd w0, w5, w4, w0 +# CHECK-NEXT: 1 3 0.50 add x3, x3, x13 +# CHECK-NEXT: 1 3 0.50 subs x1, x1, #1 +# CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 10 (47.6%) +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 1 (4.8%) + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 12 (57.1%) +# CHECK-NEXT: 1, 5 (23.8%) +# CHECK-NEXT: 2, 3 (14.3%) +# CHECK-NEXT: 3, 1 (4.8%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: [# issued], [# cycles] +# CHECK-NEXT: 0, 12 (57.1%) +# CHECK-NEXT: 
1, 5 (23.8%) +# CHECK-NEXT: 2, 3 (14.3%) +# CHECK-NEXT: 3, 1 (4.8%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: +# CHECK-NEXT: [# retired], [# cycles] +# CHECK-NEXT: 0, 12 (57.1%) +# CHECK-NEXT: 1, 6 (28.6%) +# CHECK-NEXT: 2, 3 (14.3%) + +# CHECK: Total ROB Entries: 0 +# CHECK-NEXT: Max Used ROB Entries: 6 ( inf% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( inf% ) + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 14 +# CHECK-NEXT: Max number of mappings used: 6 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: 1.00 1.00 - - - - - - - 2.00 1.00 1.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w4, [x2], #4 +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr w5, [x3] +# CHECK-NEXT: - - - - - - - - - - 1.00 - madd w0, w5, w4, w0 +# CHECK-NEXT: - 1.00 - - - - - - - - - - add x3, x3, x13 +# CHECK-NEXT: 1.00 - - - - - - - - - - - subs x1, x1, #1 +# CHECK-NEXT: - - - - - - - - - - - 1.00 str w0, [x21, x18, lsl #2] + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0 + +# CHECK: [0,0] DeeeER . . . ldr w4, [x2], #4 +# CHECK-NEXT: [0,1] D=eeeER . . . 
ldr w5, [x3] +# CHECK-NEXT: [0,2] .D===eeeeER . . madd w0, w5, w4, w0 +# CHECK-NEXT: [0,3] . DeeeER. . . add x3, x3, x13 +# CHECK-NEXT: [0,4] . DeeeER . . subs x1, x1, #1 +# CHECK-NEXT: [0,5] . D==eeeeER . . str w0, [x21, x18, lsl #2] +# CHECK-NEXT: [1,0] . . DeeeER . . ldr w4, [x2], #4 +# CHECK-NEXT: [1,1] . . DeeeER . . ldr w5, [x3] +# CHECK-NEXT: [1,2] . . D===eeeeER . madd w0, w5, w4, w0 +# CHECK-NEXT: [1,3] . . .DeeeER . add x3, x3, x13 +# CHECK-NEXT: [1,4] . . . DeeeER . subs x1, x1, #1 +# CHECK-NEXT: [1,5] . . . D==eeeeER str w0, [x21, x18, lsl #2] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 1.0 0.0 0.0 ldr w4, [x2], #4 +# CHECK-NEXT: 1. 2 1.5 0.0 0.0 ldr w5, [x3] +# CHECK-NEXT: 2. 2 4.0 0.0 0.0 madd w0, w5, w4, w0 +# CHECK-NEXT: 3. 2 1.0 0.0 0.0 add x3, x3, x13 +# CHECK-NEXT: 4. 2 1.0 0.0 0.0 subs x1, x1, #1 +# CHECK-NEXT: 5. 2 3.0 0.0 0.0 str w0, [x21, x18, lsl #2] +# CHECK-NEXT: 2 1.9 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/in-order-cpu.s b/llvm/test/tools/llvm-mca/X86/in-order-cpu.s --- a/llvm/test/tools/llvm-mca/X86/in-order-cpu.s +++ b/llvm/test/tools/llvm-mca/X86/in-order-cpu.s @@ -1,3 +1,3 @@ -# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o /dev/null 2>&1 | FileCheck %s - -# CHECK: error: please specify an out-of-order cpu. 'atom' is an in-order cpu. +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o /dev/null 2>&1 | FileCheck %s +# CHECK: warning: support for in-order CPU 'atom' is experimental. 
+movsbw %al, %di diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -335,9 +335,8 @@ return 1; if (!PrintInstructionTables && !STI->getSchedModel().isOutOfOrder()) { - WithColor::error() << "please specify an out-of-order cpu. '" << MCPU - << "' is an in-order cpu.\n"; - return 1; + WithColor::warning() << "support for in-order CPU '" << MCPU + << "' is experimental.\n"; } if (!STI->getSchedModel().hasInstrSchedModel()) {