Index: include/llvm/MCA/Context.h
===================================================================
--- include/llvm/MCA/Context.h
+++ include/llvm/MCA/Context.h
@@ -31,11 +31,15 @@
 /// This is a convenience struct to hold the parameters necessary for creating
 /// the pre-built "default" out-of-order pipeline.
 struct PipelineOptions {
-  PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS,
-                  bool NoAlias, bool ShouldEnableBottleneckAnalysis = false)
-      : DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
+  PipelineOptions(unsigned UOPQSize, unsigned DecThr, unsigned DW, unsigned RFS,
+                  unsigned LQS, unsigned SQS, bool NoAlias,
+                  bool ShouldEnableBottleneckAnalysis = false)
+      : MicroOpQueueSize(UOPQSize), DecodersThroughput(DecThr),
+        DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS),
         StoreQueueSize(SQS), AssumeNoAlias(NoAlias),
         EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {}
+  unsigned MicroOpQueueSize;   // Zero means no micro-op queue stage.
+  unsigned DecodersThroughput; // Instructions per cycle (0 means no limit).
   unsigned DispatchWidth;
   unsigned RegisterFileSize;
   unsigned LoadQueueSize;
Index: include/llvm/MCA/Stages/MicroOpQueueStage.h
===================================================================
--- include/llvm/MCA/Stages/MicroOpQueueStage.h
+++ include/llvm/MCA/Stages/MicroOpQueueStage.h
@@ -0,0 +1,88 @@
+//===---------------------- MicroOpQueueStage.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a stage that implements a queue of micro opcodes.
+/// It can be used to simulate a hardware micro-op queue that serves opcodes to
+/// the out of order backend.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
+#define LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/Stages/Stage.h"
+
+namespace llvm {
+namespace mca {
+
+/// A stage that simulates a queue of instruction opcodes.
+class MicroOpQueueStage : public Stage {
+  SmallVector<InstRef, 8> Buffer;
+  unsigned NextAvailableSlotIdx;
+  unsigned CurrentInstructionSlotIdx;
+
+  // Limits the number of instructions that can be written to this buffer every
+  // cycle. A value of zero means that there is no limit to the instruction
+  // throughput in input.
+  const unsigned MaxIPC;
+  unsigned CurrentIPC;
+
+  // Number of entries that are available during this cycle.
+  unsigned AvailableEntries;
+
+  // True if instructions dispatched to this stage don't need to wait for the
+  // next cycle before moving to the next stage.
+  // False if this buffer acts as a one cycle delay in the execution pipeline.
+  bool IsZeroLatencyStage;
+
+  MicroOpQueueStage(const MicroOpQueueStage &Other) = delete;
+  MicroOpQueueStage &operator=(const MicroOpQueueStage &Other) = delete;
+
+  // By default, an instruction consumes a number of buffer entries equal to its
+  // number of micro opcodes (see field `InstrDesc::NumMicroOps`). The number of
+  // entries consumed by an instruction is normalized to
+  // std::min(NumMicroOps, Buffer.size()), and is never less than one. This is
+  // to avoid problems with (microcoded) instructions that generate a number of
+  // micro opcodes that doesn't fit in the buffer.
+  unsigned getNormalizedOpcodes(const InstRef &IR) const {
+    unsigned NormalizedOpcodes =
+        std::min(static_cast<unsigned>(Buffer.size()),
+                 IR.getInstruction()->getDesc().NumMicroOps);
+    return NormalizedOpcodes ? NormalizedOpcodes : 1U;
+  }
+
+  Error moveInstructions();
+
+public:
+  MicroOpQueueStage(unsigned Size, unsigned IPC = 0,
+                    bool ZeroLatencyStage = true);
+
+  bool isAvailable(const InstRef &IR) const override {
+    if (MaxIPC && CurrentIPC == MaxIPC)
+      return false;
+    unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
+    if (NormalizedOpcodes > AvailableEntries)
+      return false;
+    return true;
+  }
+
+  bool hasWorkToComplete() const override {
+    return AvailableEntries != Buffer.size();
+  }
+
+  Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error cycleEnd() override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif // LLVM_MCA_MICRO_OP_QUEUE_STAGE_H
Index: lib/MCA/CMakeLists.txt
===================================================================
--- lib/MCA/CMakeLists.txt
+++ lib/MCA/CMakeLists.txt
@@ -14,6 +14,7 @@
   Stages/EntryStage.cpp
   Stages/ExecuteStage.cpp
   Stages/InstructionTables.cpp
+  Stages/MicroOpQueueStage.cpp
   Stages/RetireStage.cpp
   Stages/Stage.cpp
   Support.cpp
Index: lib/MCA/Context.cpp
===================================================================
--- lib/MCA/Context.cpp
+++ lib/MCA/Context.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MCA/Stages/DispatchStage.h"
 #include "llvm/MCA/Stages/EntryStage.h"
 #include "llvm/MCA/Stages/ExecuteStage.h"
+#include "llvm/MCA/Stages/MicroOpQueueStage.h"
 #include "llvm/MCA/Stages/RetireStage.h"
 
 namespace llvm {
@@ -55,6 +56,9 @@
   // Build the pipeline.
auto StagePipeline = llvm::make_unique<Pipeline>();
   StagePipeline->appendStage(std::move(Fetch));
+  if (Opts.MicroOpQueueSize)
+    StagePipeline->appendStage(llvm::make_unique<MicroOpQueueStage>(
+        Opts.MicroOpQueueSize, Opts.DecodersThroughput));
   StagePipeline->appendStage(std::move(Dispatch));
   StagePipeline->appendStage(std::move(Execute));
   StagePipeline->appendStage(std::move(Retire));
Index: lib/MCA/Stages/MicroOpQueueStage.cpp
===================================================================
--- lib/MCA/Stages/MicroOpQueueStage.cpp
+++ lib/MCA/Stages/MicroOpQueueStage.cpp
@@ -0,0 +1,82 @@
+//===---------------------- MicroOpQueueStage.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the MicroOpQueueStage.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MCA/Stages/MicroOpQueueStage.h"
+
+namespace llvm {
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+// Forwards buffered instructions to the next stage, starting from the oldest
+// one, until the queue is empty or the next stage cannot accept more. Every
+// forwarded instruction gives its (normalized) entries back to the queue.
+Error MicroOpQueueStage::moveInstructions() {
+  InstRef IR = Buffer[CurrentInstructionSlotIdx];
+  while (IR && checkNextStage(IR)) {
+    if (llvm::Error Val = moveToTheNextStage(IR))
+      return Val;
+
+    Buffer[CurrentInstructionSlotIdx].invalidate();
+    unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
+    CurrentInstructionSlotIdx += NormalizedOpcodes;
+    CurrentInstructionSlotIdx %= Buffer.size();
+    AvailableEntries += NormalizedOpcodes;
+    IR = Buffer[CurrentInstructionSlotIdx];
+  }
+
+  return llvm::ErrorSuccess();
+}
+
+// A `Size` of zero is promoted to one entry so that `Buffer` is never empty
+// (the slot indices are computed modulo Buffer.size()).
+MicroOpQueueStage::MicroOpQueueStage(unsigned Size, unsigned IPC,
+                                     bool ZeroLatencyStage)
+    : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), MaxIPC(IPC),
+      CurrentIPC(0), IsZeroLatencyStage(ZeroLatencyStage) {
+  Buffer.resize(Size ? Size : 1);
+  AvailableEntries = Buffer.size();
+}
+
+// Stores IR in the next free slot and reserves its (normalized) entries.
+// NOTE: AvailableEntries would wrap around if IR needed more entries than are
+// free; this presumably relies on isAvailable(IR) being checked first.
+Error MicroOpQueueStage::execute(InstRef &IR) {
+  Buffer[NextAvailableSlotIdx] = IR;
+  unsigned NormalizedOpcodes = getNormalizedOpcodes(IR);
+  NextAvailableSlotIdx += NormalizedOpcodes;
+  NextAvailableSlotIdx %= Buffer.size();
+  AvailableEntries -= NormalizedOpcodes;
+  ++CurrentIPC;
+  return llvm::ErrorSuccess();
+}
+
+// Resets the per-cycle instruction counter. A stage that is not zero latency
+// forwards its buffered instructions at the start of the cycle.
+Error MicroOpQueueStage::cycleStart() {
+  CurrentIPC = 0;
+  if (!IsZeroLatencyStage)
+    return moveInstructions();
+  return llvm::ErrorSuccess();
+}
+
+// A zero latency stage forwards its buffered instructions at the end of the
+// cycle instead.
+Error MicroOpQueueStage::cycleEnd() {
+  if (IsZeroLatencyStage)
+    return moveInstructions();
+  return llvm::ErrorSuccess();
+}
+
+} // namespace mca
+} // namespace llvm
Index: test/tools/llvm-mca/X86/uop-queue.s
===================================================================
--- test/tools/llvm-mca/X86/uop-queue.s
+++ test/tools/llvm-mca/X86/uop-queue.s
@@ -0,0 +1,105 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-1
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=3 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-3
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-UOPQ-4
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=HASWELL-DEC-2
+
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 
-iterations=1500 -micro-op-queue-size=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-1 +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=2 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-UOPQ-2 +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -micro-op-queue-size=4 -decoder-throughput=1 -all-views=false -summary-view < %s | FileCheck %s -check-prefix=BTVER2-DEC-1 + +add %eax, %eax +add %ebx, %ebx +add %ecx, %ecx +add %edx, %edx + +# BTVER2-DEC-2: Iterations: 1500 +# BTVER2-DEC-2-NEXT: Instructions: 6000 +# BTVER2-DEC-2-NEXT: Total Cycles: 3003 +# BTVER2-DEC-2-NEXT: Total uOps: 6000 + +# BTVER2-DEC-2: Dispatch Width: 2 +# BTVER2-DEC-2-NEXT: uOps Per Cycle: 2.00 +# BTVER2-DEC-2-NEXT: IPC: 2.00 +# BTVER2-DEC-2-NEXT: Block RThroughput: 2.0 + +# BTVER2-DEC-1: Iterations: 1500 +# BTVER2-DEC-1-NEXT: Instructions: 6000 +# BTVER2-DEC-1-NEXT: Total Cycles: 6003 +# BTVER2-DEC-1-NEXT: Total uOps: 6000 + +# BTVER2-UOPQ-1: Iterations: 1500 +# BTVER2-UOPQ-1-NEXT: Instructions: 6000 +# BTVER2-UOPQ-1-NEXT: Total Cycles: 6003 +# BTVER2-UOPQ-1-NEXT: Total uOps: 6000 + +# BTVER2-UOPQ-2: Iterations: 1500 +# BTVER2-UOPQ-2-NEXT: Instructions: 6000 +# BTVER2-UOPQ-2-NEXT: Total Cycles: 3003 +# BTVER2-UOPQ-2-NEXT: Total uOps: 6000 + +# HASWELL-DEC-2: Iterations: 1500 +# HASWELL-DEC-2-NEXT: Instructions: 6000 +# HASWELL-DEC-2-NEXT: Total Cycles: 3003 +# HASWELL-DEC-2-NEXT: Total uOps: 6000 + +# HASWELL-UOPQ-1: Iterations: 1500 +# HASWELL-UOPQ-1-NEXT: Instructions: 6000 +# HASWELL-UOPQ-1-NEXT: Total Cycles: 6003 +# HASWELL-UOPQ-1-NEXT: Total uOps: 6000 + +# HASWELL-UOPQ-2: Iterations: 1500 +# HASWELL-UOPQ-2-NEXT: Instructions: 6000 +# HASWELL-UOPQ-2-NEXT: Total Cycles: 3003 +# HASWELL-UOPQ-2-NEXT: Total uOps: 6000 + +# HASWELL-UOPQ-3: Iterations: 1500 +# HASWELL-UOPQ-3-NEXT: Instructions: 6000 +# HASWELL-UOPQ-3-NEXT: Total Cycles: 2003 +# HASWELL-UOPQ-3-NEXT: 
Total uOps: 6000
+
+# HASWELL-UOPQ-4: Iterations: 1500
+# HASWELL-UOPQ-4-NEXT: Instructions: 6000
+# HASWELL-UOPQ-4-NEXT: Total Cycles: 1503
+# HASWELL-UOPQ-4-NEXT: Total uOps: 6000
+
+# BTVER2-DEC-1: Dispatch Width: 2
+# BTVER2-DEC-1-NEXT: uOps Per Cycle: 1.00
+# BTVER2-DEC-1-NEXT: IPC: 1.00
+# BTVER2-DEC-1-NEXT: Block RThroughput: 2.0
+
+# BTVER2-UOPQ-1: Dispatch Width: 2
+# BTVER2-UOPQ-1-NEXT: uOps Per Cycle: 1.00
+# BTVER2-UOPQ-1-NEXT: IPC: 1.00
+# BTVER2-UOPQ-1-NEXT: Block RThroughput: 2.0
+
+# BTVER2-UOPQ-2: Dispatch Width: 2
+# BTVER2-UOPQ-2-NEXT: uOps Per Cycle: 2.00
+# BTVER2-UOPQ-2-NEXT: IPC: 2.00
+# BTVER2-UOPQ-2-NEXT: Block RThroughput: 2.0
+
+# HASWELL-DEC-2: Dispatch Width: 4
+# HASWELL-DEC-2-NEXT: uOps Per Cycle: 2.00
+# HASWELL-DEC-2-NEXT: IPC: 2.00
+# HASWELL-DEC-2-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-1: Dispatch Width: 4
+# HASWELL-UOPQ-1-NEXT: uOps Per Cycle: 1.00
+# HASWELL-UOPQ-1-NEXT: IPC: 1.00
+# HASWELL-UOPQ-1-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-2: Dispatch Width: 4
+# HASWELL-UOPQ-2-NEXT: uOps Per Cycle: 2.00
+# HASWELL-UOPQ-2-NEXT: IPC: 2.00
+# HASWELL-UOPQ-2-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-3: Dispatch Width: 4
+# HASWELL-UOPQ-3-NEXT: uOps Per Cycle: 3.00
+# HASWELL-UOPQ-3-NEXT: IPC: 3.00
+# HASWELL-UOPQ-3-NEXT: Block RThroughput: 1.0
+
+# HASWELL-UOPQ-4: Dispatch Width: 4
+# HASWELL-UOPQ-4-NEXT: uOps Per Cycle: 3.99
+# HASWELL-UOPQ-4-NEXT: IPC: 3.99
+# HASWELL-UOPQ-4-NEXT: Block RThroughput: 1.0
Index: tools/llvm-mca/llvm-mca.cpp
===================================================================
--- tools/llvm-mca/llvm-mca.cpp
+++ tools/llvm-mca/llvm-mca.cpp
@@ -100,6 +100,17 @@
                               "be used for register mappings"),
                      cl::cat(ToolOptions), cl::init(0));
 
+static cl::opt<unsigned>
+    MicroOpQueue("micro-op-queue-size", cl::Hidden,
+                 cl::desc("Number of entries in the micro-op queue"),
+                 cl::cat(ToolOptions), cl::init(0));
+
+static cl::opt<unsigned>
+    DecoderThroughput("decoder-throughput", cl::Hidden,
+                      cl::desc("Maximum throughput from the decoders "
+                               "(instructions per cycle)"),
+                      cl::cat(ToolOptions), cl::init(0));
+
 static cl::opt<bool>
     PrintRegisterFileStats("register-file-stats",
                            cl::desc("Print register file statistics"),
@@ -387,9 +398,9 @@
   // Create a context to control ownership of the pipeline hardware.
   mca::Context MCA(*MRI, *STI);
 
-  mca::PipelineOptions PO(DispatchWidth, RegisterFileSize, LoadQueueSize,
-                          StoreQueueSize, AssumeNoAlias,
-                          EnableBottleneckAnalysis);
+  mca::PipelineOptions PO(MicroOpQueue, DecoderThroughput, DispatchWidth,
+                          RegisterFileSize, LoadQueueSize, StoreQueueSize,
+                          AssumeNoAlias, EnableBottleneckAnalysis);
 
   // Number each region in the sequence.
   unsigned RegionIdx = 0;