diff --git a/llvm/include/llvm/MCA/Stages/DecodeStage.h b/llvm/include/llvm/MCA/Stages/DecodeStage.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/MCA/Stages/DecodeStage.h
@@ -0,0 +1,66 @@
+//===---------------------- DecodeStage.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a stage that implements instruction decoding
+/// into micro-ops.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCA_DECODE_STAGE_H
+#define LLVM_MCA_DECODE_STAGE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/Stages/Stage.h"
+
+namespace llvm {
+namespace mca {
+
+/// A stage that simulates an instruction decoder.
+///
+/// Instructions for which IsMicroCoded() holds are decoded on their own, and
+/// their micro-op count is consumed two at a time by each non-stalled
+/// cycleEnd(). All other instructions are queued in \c Decoders by execute().
+class DecodeStage : public Stage {
+  /// State of the microcoded instruction currently being decoded, if any.
+  struct MicroOpEngine {
+    /// The microcoded instruction; invalid while none is being decoded.
+    InstRef IR;
+    /// Number of micro-ops of \c IR that are still to be generated.
+    int MicroOpsLeftToGenerate = 0;
+  } MicroOpEngine;
+  /// Non-microcoded instructions accepted by execute(). cycleEnd() invalidates
+  /// an entry once it has been moved to the next stage.
+  SmallVector<InstRef, 4> Decoders;
+
+  DecodeStage(const DecodeStage &Other) = delete;
+  DecodeStage &operator=(const DecodeStage &Other) = delete;
+
+  /// Is this instruction microcoded?
+  static bool IsMicroCoded(const InstRef &IR);
+
+public:
+  DecodeStage();
+
+  /// Would decoder be able to start decoding the provided instruction?
+  bool isAvailable(const InstRef &IR) const override;
+
+  /// Are there any instructions currently being decoded?
+  bool hasWorkToComplete() const override;
+
+  /// Start decoding the provided instruction.
+  Error execute(InstRef &IR) override;
+
+  /// Generate micro-ops, if any, and move decoded instructions onward.
+  Error cycleEnd() override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif // LLVM_MCA_DECODE_STAGE_H
diff --git a/llvm/lib/MCA/CMakeLists.txt b/llvm/lib/MCA/CMakeLists.txt
--- a/llvm/lib/MCA/CMakeLists.txt
+++ b/llvm/lib/MCA/CMakeLists.txt
@@ -11,6 +11,7 @@
   InstrBuilder.cpp
   Instruction.cpp
   Pipeline.cpp
+  Stages/DecodeStage.cpp
   Stages/DispatchStage.cpp
   Stages/EntryStage.cpp
   Stages/ExecuteStage.cpp
diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp
--- a/llvm/lib/MCA/Context.cpp
+++ b/llvm/lib/MCA/Context.cpp
@@ -18,6 +18,7 @@
 #include "llvm/MCA/HardwareUnits/RegisterFile.h"
 #include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
 #include "llvm/MCA/HardwareUnits/Scheduler.h"
+#include "llvm/MCA/Stages/DecodeStage.h"
 #include "llvm/MCA/Stages/DispatchStage.h"
 #include "llvm/MCA/Stages/EntryStage.h"
 #include "llvm/MCA/Stages/ExecuteStage.h"
@@ -40,6 +41,7 @@
 
   // Create the pipeline stages.
   auto Fetch = std::make_unique<EntryStage>(SrcMgr);
+  auto Decode = std::make_unique<DecodeStage>();
   auto Dispatch = std::make_unique<DispatchStage>(STI, MRI, Opts.DispatchWidth,
                                                   *RCU, *PRF);
   auto Execute =
@@ -55,6 +57,10 @@
   // Build the pipeline.
   auto StagePipeline = std::make_unique<Pipeline>();
   StagePipeline->appendStage(std::move(Fetch));
+  // FIXME: parametrize decoder stage and enable globally.
+  if (STI.getTargetTriple().getArch() == Triple::ArchType::x86_64 &&
+      STI.getCPU() == "bdver2")
+    StagePipeline->appendStage(std::move(Decode));
   if (Opts.MicroOpQueueSize)
     StagePipeline->appendStage(std::make_unique<MicroOpQueueStage>(
         Opts.MicroOpQueueSize, Opts.DecodersThroughput));
diff --git a/llvm/lib/MCA/Stages/DecodeStage.cpp b/llvm/lib/MCA/Stages/DecodeStage.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/MCA/Stages/DecodeStage.cpp
@@ -0,0 +1,145 @@
+//===---------------------- DecodeStage.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the DecodeStage.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MCA/Stages/DecodeStage.h"
+#include <numeric>
+
+namespace llvm {
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+DecodeStage::DecodeStage() {}
+
+bool DecodeStage::IsMicroCoded(const InstRef &IR) {
+  // FIXME: parametrize. Anything above 2 micro-ops is treated as microcoded.
+  return IR.getInstruction()->getDesc().NumMicroOps > 2;
+}
+
+bool DecodeStage::isAvailable(const InstRef &IR) const {
+  // If we are currently decoding a microcoded instruction,
+  // we can't start decoding *anything* else.
+  if (MicroOpEngine.IR)
+    return false;
+
+  if (IsMicroCoded(IR)) {
+    // We cannot start decoding a microcoded instruction until
+    // we finish decoding *all* preceding instructions.
+    return !hasWorkToComplete();
+  }
+
+  // There are only 4 decoders; only still-valid entries occupy one.
+  // FIXME: parametrize.
+  if (std::count_if(Decoders.begin(), Decoders.end(),
+                    [](const InstRef &IR) { return bool(IR); }) >= 4)
+    return false;
+
+  // FIXME: parametrize. It can be more complex than that.
+  unsigned WouldBeNumMicroOpsTotal =
+      std::accumulate(Decoders.begin(), Decoders.end(),
+                      IR.getInstruction()->getDesc().NumMicroOps,
+                      [](unsigned NumMicroOpsSoFar, const InstRef &IR) {
+                        if (const Instruction *Instr = IR.getInstruction())
+                          NumMicroOpsSoFar += Instr->getDesc().NumMicroOps;
+                        return NumMicroOpsSoFar;
+                      });
+  // We can at most generate 4 microops per cycle.
+  // That is, we can generate 2-2/2-1-1/1-1-1-1.
+  return WouldBeNumMicroOpsTotal <= 4;
+}
+
+bool DecodeStage::hasWorkToComplete() const {
+  return MicroOpEngine.IR ||
+         any_of(Decoders, [](const InstRef &IR) { return bool(IR); });
+}
+
+Error DecodeStage::execute(InstRef &IR) {
+  assert(isAvailable(IR) &&
+         "Should not start decoding instruction unless we are ready to.");
+
+  if (IsMicroCoded(IR)) {
+    assert(Decoders.empty() && !MicroOpEngine.IR &&
+           "Must not start decoding microcoded instruction if the decoder is "
+           "already occupied.");
+
+    MicroOpEngine.IR = IR;
+    MicroOpEngine.MicroOpsLeftToGenerate =
+        IR.getInstruction()->getDesc().NumMicroOps;
+    return llvm::ErrorSuccess();
+  }
+
+  assert(!MicroOpEngine.IR &&
+         "Must not start decoding non-microcoded instruction if already "
+         "decoding microcoded instruction.");
+  Decoders.emplace_back(IR);
+  return llvm::ErrorSuccess();
+}
+
+Error DecodeStage::cycleEnd() {
+  if (!hasWorkToComplete())
+    return llvm::ErrorSuccess(); // Decoder is empty.
+
+  if (InstRef &IR = MicroOpEngine.IR) {
+    assert(Decoders.empty() &&
+           "Microcoded instruction must be decoded standalone");
+
+    // Is next stage ready to receive all the microcodes?
+    if (!checkNextStage(IR))
+      return llvm::ErrorSuccess(); // Stall.
+
+    // Okay, start/continue generating microops.
+
+    // FIXME: is that so for BdVer2?
+    // FIXME: parametrize.
+    MicroOpEngine.MicroOpsLeftToGenerate -= 2;
+    // Did we just finish generating microops for this microcoded instruction?
+    if (MicroOpEngine.MicroOpsLeftToGenerate > 0)
+      return llvm::ErrorSuccess(); // More microops left to generate...
+
+    // Done decoding/generating.
+    if (llvm::Error Val = moveToTheNextStage(IR))
+      return Val;
+    IR.invalidate();
+    return llvm::ErrorSuccess();
+  }
+
+  // Okay, must be a normal instruction.
+  assert(!Decoders.empty() && "Should be decoding some plain instructions.");
+  for (InstRef &IR : Decoders) {
+    if (!IR)
+      continue;
+
+    // Is next stage ready to receive microops of this decoded instruction?
+    if (!checkNextStage(IR))
+      break; // Stall; later instructions must not overtake this one.
+    // Done decoding/generating in a single cycle.
+    if (llvm::Error Val = moveToTheNextStage(IR))
+      return Val;
+    IR.invalidate();
+  }
+
+  // Trim fully decoded instructions from the back.
+  while (!Decoders.empty() && !Decoders.back())
+    Decoders.pop_back();
+  // Find the first instruction which hasn't been fully decoded.
+  auto It = find_if(Decoders, [](const InstRef &IR) { return bool(IR); });
+  unsigned NumDecoded = std::distance(Decoders.begin(), It);
+  // Erase the decoded prefix only once it is at least half of the queue.
+  if ((NumDecoded * 2) >= Decoders.size())
+    Decoders.erase(Decoders.begin(), It);
+
+  return llvm::ErrorSuccess();
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s b/llvm/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s @@ -44,11 +44,11 @@ # CHECK: [0,0] DeeeeeeER . . . imulq $5, %rcx, %rax # CHECK-NEXT: [0,1] DeeE----R . . . lzcntl %ecx, %eax # CHECK-NEXT: [0,2] D==eE---R . . . andq %rcx, %rax -# CHECK-NEXT: [0,3] .D==eeeER . . . bsfq %rax, %rcx -# CHECK-NEXT: [1,0] . D====eeeeeeER. . imulq $5, %rcx, %rax -# CHECK-NEXT: [1,1] . D======eeE-R. . lzcntl %ecx, %eax -# CHECK-NEXT: [1,2] . D========eER. . andq %rcx, %rax -# CHECK-NEXT: [1,3] . D========eeeER bsfq %rax, %rcx +# CHECK-NEXT: [0,3] . DeeeER . . . bsfq %rax, %rcx +# CHECK-NEXT: [1,0] . D==eeeeeeER. . imulq $5, %rcx, %rax +# CHECK-NEXT: [1,1] . D====eeE-R. . lzcntl %ecx, %eax +# CHECK-NEXT: [1,2] . D======eER. . andq %rcx, %rax +# CHECK-NEXT: [1,3] . . D====eeeER bsfq %rax, %rcx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -57,8 +57,8 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 3.0 0.5 0.0 imulq $5, %rcx, %rax -# CHECK-NEXT: 1. 
2 4.0 2.0 2.5 lzcntl %ecx, %eax -# CHECK-NEXT: 2. 2 6.0 0.0 1.5 andq %rcx, %rax -# CHECK-NEXT: 3. 2 6.0 0.0 0.0 bsfq %rax, %rcx -# CHECK-NEXT: 2 4.8 0.6 1.0 +# CHECK-NEXT: 0. 2 2.0 0.5 0.0 imulq $5, %rcx, %rax +# CHECK-NEXT: 1. 2 3.0 2.0 2.5 lzcntl %ecx, %eax +# CHECK-NEXT: 2. 2 5.0 0.0 1.5 andq %rcx, %rax +# CHECK-NEXT: 3. 2 3.0 0.0 0.0 bsfq %rax, %rcx +# CHECK-NEXT: 2 3.3 0.6 1.0 diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/dot-product.s b/llvm/test/tools/llvm-mca/X86/BdVer2/dot-product.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/dot-product.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/dot-product.s @@ -7,12 +7,12 @@ # CHECK: Iterations: 300 # CHECK-NEXT: Instructions: 900 -# CHECK-NEXT: Total Cycles: 1709 +# CHECK-NEXT: Total Cycles: 1682 # CHECK-NEXT: Total uOps: 2100 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.23 -# CHECK-NEXT: IPC: 0.53 +# CHECK-NEXT: uOps Per Cycle: 1.25 +# CHECK-NEXT: IPC: 0.54 # CHECK-NEXT: Block RThroughput: 5.5 # CHECK: Instruction Info: @@ -55,27 +55,27 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] -# CHECK-NEXT: - - - - - - - - 5.50 5.50 - - - - 2.00 1.00 - - - - - - - +# CHECK-NEXT: - - - - - - - - 5.52 5.48 - - - - 2.00 1.00 - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] Instructions: -# CHECK-NEXT: - - - - - - - - 0.49 0.51 - - - - - 1.00 - - - - - - - vmulps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - - - 2.53 2.47 - - - - 1.00 - - - - - - - - vhaddps %xmm2, %xmm2, %xmm3 -# CHECK-NEXT: - - - - - - - - 2.48 2.52 - - - - 1.00 - - - - - - - - vhaddps %xmm3, %xmm3, %xmm4 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - - 1.00 - - - - - - - vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - 2.13 2.87 - - - - 1.00 - - - - - - - - vhaddps 
%xmm2, %xmm2, %xmm3 +# CHECK-NEXT: - - - - - - - - 2.88 2.12 - - - - 1.00 - - - - - - - - vhaddps %xmm3, %xmm3, %xmm4 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 -# CHECK: [0,0] DeeeeeER . . . . . . vmulps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,1] D=====eeeeeeeeeeeER . . . . vhaddps %xmm2, %xmm2, %xmm3 -# CHECK-NEXT: [0,2] .D===============eeeeeeeeeeeER. . vhaddps %xmm3, %xmm3, %xmm4 -# CHECK-NEXT: [1,0] .DeeeeeE---------------------R. . vmulps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,1] . D====eeeeeeeeeeeE----------R. . vhaddps %xmm2, %xmm2, %xmm3 -# CHECK-NEXT: [1,2] . D==============eeeeeeeeeeeER . vhaddps %xmm3, %xmm3, %xmm4 -# CHECK-NEXT: [2,0] . DeeeeeE--------------------R . vmulps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [2,1] . D======eeeeeeeeeeeE-------R . vhaddps %xmm2, %xmm2, %xmm3 -# CHECK-NEXT: [2,2] . D================eeeeeeeeeeeER vhaddps %xmm3, %xmm3, %xmm4 +# CHECK: [0,0] DeeeeeER . . . . . . . vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,1] . D===eeeeeeeeeeeER . . . . . vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [0,2] . D============eeeeeeeeeeeER. . . vhaddps %xmm3, %xmm3, %xmm4 +# CHECK-NEXT: [1,0] . DeeeeeE-----------------R. . . vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,1] . . D===eeeeeeeeeeeE------R. . . vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [1,2] . . D============eeeeeeeeeeeER. . vhaddps %xmm3, %xmm3, %xmm4 +# CHECK-NEXT: [2,0] . . DeeeeeE-----------------R. . vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [2,1] . . . D===eeeeeeeeeeeE------R. . vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [2,2] . . . D============eeeeeeeeeeeER vhaddps %xmm3, %xmm3, %xmm4 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -84,7 +84,7 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 1.0 1.0 13.7 vmulps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 1. 
3 6.0 0.7 5.7 vhaddps %xmm2, %xmm2, %xmm3 -# CHECK-NEXT: 2. 3 16.0 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4 -# CHECK-NEXT: 3 7.7 0.6 6.4 +# CHECK-NEXT: 0. 3 1.0 1.0 11.3 vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 1. 3 4.0 0.0 4.0 vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: 2. 3 13.0 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4 +# CHECK-NEXT: 3 6.0 0.3 5.1 diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s b/llvm/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s @@ -6,11 +6,11 @@ # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 20 +# CHECK-NEXT: Total Cycles: 21 # CHECK-NEXT: Total uOps: 5 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: uOps Per Cycle: 0.24 # CHECK-NEXT: IPC: 0.10 # CHECK-NEXT: Block RThroughput: 3.5 @@ -28,10 +28,10 @@ # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 0123456789 0 -# CHECK: [0,0] DeeER. . . . vshufps $0, %xmm0, %xmm1, %xmm1 -# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeER vhaddps (%rdi), %xmm1, %xmm2 +# CHECK: [0,0] DeeER. . . . vshufps $0, %xmm0, %xmm1, %xmm1 +# CHECK-NEXT: [0,1] . 
DeeeeeeeeeeeeeeeeER vhaddps (%rdi), %xmm1, %xmm2 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s b/llvm/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s @@ -6,12 +6,12 @@ # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 20 +# CHECK-NEXT: Total Cycles: 24 # CHECK-NEXT: Total uOps: 11 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.55 -# CHECK-NEXT: IPC: 0.10 +# CHECK-NEXT: uOps Per Cycle: 0.46 +# CHECK-NEXT: IPC: 0.08 # CHECK-NEXT: Block RThroughput: 5.0 # CHECK: Instruction Info: @@ -28,10 +28,10 @@ # CHECK: Timeline view: # CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 0123456789 0123 -# CHECK: [0,0] DeeER. . . . vshufps $0, %xmm0, %xmm1, %xmm1 -# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeER vhaddps (%rdi), %ymm1, %ymm2 +# CHECK: [0,0] DeeER. . . . . vshufps $0, %xmm0, %xmm1, %xmm1 +# CHECK-NEXT: [0,1] . 
DeeeeeeeeeeeeeeeeER vhaddps (%rdi), %ymm1, %ymm2 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s b/llvm/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s @@ -13,13 +13,13 @@ # ENABLED: Iterations: 100 # ENABLED-NEXT: Instructions: 300 -# ENABLED-NEXT: Total Cycles: 583 +# ENABLED-NEXT: Total Cycles: 582 # ENABLED-NEXT: Total uOps: 700 # ENABLED: Dispatch Width: 4 # ENABLED-NEXT: uOps Per Cycle: 1.20 -# ENABLED-NEXT: IPC: 0.51 +# ENABLED-NEXT: IPC: 0.52 # ENABLED-NEXT: Block RThroughput: 5.5 # ENABLED: Instruction Info: diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s b/llvm/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s @@ -19,7 +19,7 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 800 -# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total Cycles: 1507 # CHECK-NEXT: Total uOps: 1500 # CHECK: Dispatch Width: 4 @@ -72,39 +72,39 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] -# CHECK-NEXT: - - - - - - - 1.00 15.06 14.94 1.12 1.88 9.00 1.00 6.44 4.56 - - - - - - - +# CHECK-NEXT: - - - - - - - 1.00 15.09 14.91 1.10 1.90 9.00 1.00 6.00 5.00 - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] Instructions: -# CHECK-NEXT: - - - - - - - - - - 0.88 0.12 2.00 - 2.00 1.00 - - - - - - - vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - - - - - 0.24 1.76 - - 0.44 0.56 - - - - - - - vpand %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - - - 
0.90 0.10 2.00 - 2.00 1.00 - - - - - - - vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - - - 0.20 1.80 - - - 1.00 - - - - - - - vpand %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - 1.00 - - - - - 1.00 1.00 - - - - - - - - vcvttps2dq %xmm0, %xmm2 # CHECK-NEXT: - - - - - - - - - - - - 7.00 - 1.00 - - - - - - - - vpclmulqdq $0, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - - - 0.52 0.48 - - - - 1.00 - - - - - - - - vaddps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - - - 4.50 4.50 - - - - - 1.00 - - - - - - - vsqrtps %xmm0, %xmm2 -# CHECK-NEXT: - - - - - - - - 1.04 0.96 - - - - 1.00 - - - - - - - - vaddps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - - - - - - - 0.50 0.50 - - - - 1.00 - - - - - - - - vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - 4.59 4.41 - - - - - 1.00 - - - - - - - vsqrtps %xmm0, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - 1.00 - - - - - - - - vaddps %ymm0, %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - - 9.00 9.00 - - - - - 2.00 - - - - - - - vsqrtps %ymm0, %ymm2 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: 0123456789 0123456 # CHECK-NEXT: Index 0123456789 0123456789 -# CHECK: [0,0] DeeeeeER . . . . . . vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,1] D=eeE--R . . . . . . vpand %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,2] D==eeeeER . . . . . . vcvttps2dq %xmm0, %xmm2 -# CHECK-NEXT: [0,3] .D==eeeeeeeeeeeeER . . . . vpclmulqdq $0, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,4] . D===================eeeeeER . . vaddps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,5] . DeeeeeeeeeE---------------R . . vsqrtps %xmm0, %xmm2 -# CHECK-NEXT: [0,6] . D===================eeeeeER. . vaddps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: [0,7] . DeeeeeeeeeE---------------R. . vsqrtps %ymm0, %ymm2 -# CHECK-NEXT: [1,0] . D======eeeeeE------------R. . vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,1] . DeeE---------------------R. . vpand %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,2] . D=eeeeE-------------------R . 
vcvttps2dq %xmm0, %xmm2 -# CHECK-NEXT: [1,3] . D=======eeeeeeeeeeeeE----R . vpclmulqdq $0, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,4] . .D==================eeeeeER. vaddps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,5] . .D=====eeeeeeeeeE---------R. vsqrtps %xmm0, %xmm2 -# CHECK-NEXT: [1,6] . . D==================eeeeeER vaddps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: [1,7] . . D=============eeeeeeeeeE-R vsqrtps %ymm0, %ymm2 +# CHECK: [0,0] DeeeeeER . . . . . .. vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,1] D=eeE--R . . . . . .. vpand %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,2] D==eeeeER . . . . . .. vcvttps2dq %xmm0, %xmm2 +# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeER . . . .. vpclmulqdq $0, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,4] . DeeeeeE------R . . . .. vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,5] . DeeeeeeeeeE--R . . . .. vsqrtps %xmm0, %xmm2 +# CHECK-NEXT: [0,6] . DeeeeeE-----R . . . .. vaddps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: [0,7] . D==eeeeeeeeeER . . . .. vsqrtps %ymm0, %ymm2 +# CHECK-NEXT: [1,0] . .D====eeeeeE-R . . . .. vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,1] . .DeeE--------R . . . .. vpand %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,2] . .DeeeeE------R . . . .. vcvttps2dq %xmm0, %xmm2 +# CHECK-NEXT: [1,3] . . D===eeeeeeeeeeeeER . .. vpclmulqdq $0, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,4] . . D===eeeeeE------R . .. vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,5] . . D====eeeeeeeeeE-R . .. vsqrtps %xmm0, %xmm2 +# CHECK-NEXT: [1,6] . . .D============eeeeeER .. vaddps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: [1,7] . . .D==============eeeeeeeeeER vsqrtps %ymm0, %ymm2 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -113,12 +113,12 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 4.0 4.0 6.0 vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 1. 2 1.5 1.5 11.5 vpand %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 2. 2 2.5 2.5 9.5 vcvttps2dq %xmm0, %xmm2 -# CHECK-NEXT: 3. 
2 5.5 5.5 2.0 vpclmulqdq $0, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4. 2 19.5 19.5 0.0 vaddps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 5. 2 3.5 3.5 12.0 vsqrtps %xmm0, %xmm2 -# CHECK-NEXT: 6. 2 19.5 19.5 0.0 vaddps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 7. 2 7.5 7.5 8.0 vsqrtps %ymm0, %ymm2 -# CHECK-NEXT: 2 7.9 7.9 6.1 +# CHECK-NEXT: 0. 2 3.0 3.0 0.5 vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 1. 2 1.5 1.5 5.0 vpand %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 2. 2 2.0 2.0 3.0 vcvttps2dq %xmm0, %xmm2 +# CHECK-NEXT: 3. 2 2.5 2.5 0.0 vpclmulqdq $0, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4. 2 2.5 2.5 6.0 vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 5. 2 3.0 3.0 1.5 vsqrtps %xmm0, %xmm2 +# CHECK-NEXT: 6. 2 7.0 7.0 2.5 vaddps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 7. 2 9.0 9.0 0.0 vsqrtps %ymm0, %ymm2 +# CHECK-NEXT: 2 3.8 3.8 2.3 diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s @@ -741,7 +741,7 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 7170 +# CHECK-NEXT: Total Cycles: 7171 # CHECK-NEXT: Total uOps: 1600 # CHECK: Dispatch Width: 4 @@ -766,19 +766,19 @@ # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 5777 (80.6%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 5568 (77.6%) # CHECK-NEXT: LQ - Load queue full: 0 -# CHECK-NEXT: SQ - Store queue full: 561 (7.8%) +# CHECK-NEXT: SQ - Store queue full: 372 (5.2%) # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 6770 (94.4%) +# CHECK-NEXT: 0, 6771 (94.4%) # CHECK-NEXT: 4, 400 (5.6%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes 
issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 6770 (94.4%) +# CHECK-NEXT: 0, 6771 (94.4%) # CHECK-NEXT: 4, 400 (5.6%) # CHECK: Scheduler's queue usage: @@ -831,12 +831,12 @@ # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 01 +# CHECK-NEXT: Index 0123456789 0123456789 012 -# CHECK: [0,0] DeER . . . . . . . .. vmovaps %ymm0, (%rax) -# CHECK-NEXT: [0,1] .D=eER . . . . . . .. vmovaps %ymm1, (%rcx) -# CHECK-NEXT: [0,2] . D==================================eER.. vmovaps %ymm2, (%rdx) -# CHECK-NEXT: [0,3] . D===================================eER vmovaps %ymm3, (%rbx) +# CHECK: [0,0] .DeER. . . . . . . . . vmovaps %ymm0, (%rax) +# CHECK-NEXT: [0,1] . DeER . . . . . . . . vmovaps %ymm1, (%rcx) +# CHECK-NEXT: [0,2] . D================================eER . vmovaps %ymm2, (%rdx) +# CHECK-NEXT: [0,3] . . D================================eER vmovaps %ymm3, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -846,7 +846,7 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps %ymm0, (%rax) -# CHECK-NEXT: 1. 1 2.0 1.0 0.0 vmovaps %ymm1, (%rcx) -# CHECK-NEXT: 2. 1 35.0 33.0 0.0 vmovaps %ymm2, (%rdx) -# CHECK-NEXT: 3. 1 36.0 1.0 0.0 vmovaps %ymm3, (%rbx) -# CHECK-NEXT: 1 18.5 9.0 0.0 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 vmovaps %ymm1, (%rcx) +# CHECK-NEXT: 2. 1 33.0 33.0 0.0 vmovaps %ymm2, (%rdx) +# CHECK-NEXT: 3. 
1 33.0 1.0 0.0 vmovaps %ymm3, (%rbx) +# CHECK-NEXT: 1 17.0 9.0 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s b/llvm/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s @@ -372,11 +372,11 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 200 -# CHECK-NEXT: Total Cycles: 903 +# CHECK-NEXT: Total Cycles: 906 # CHECK-NEXT: Total uOps: 1000 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.11 +# CHECK-NEXT: uOps Per Cycle: 1.10 # CHECK-NEXT: IPC: 0.22 # CHECK-NEXT: Block RThroughput: 4.0 @@ -427,15 +427,15 @@ # CHECK-NEXT: - - - - - - - - 2.00 - - - - - 1.00 - - - - - - - - vaddps %ymm1, %ymm1, %ymm0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 +# CHECK-NEXT: 0123456789 012 # CHECK-NEXT: Index 0123456789 0123456789 -# CHECK: [0,0] DeeeeER . . . . . vperm2f128 $136, %ymm0, %ymm0, %ymm1 -# CHECK-NEXT: [0,1] . D==eeeeeER . . . . vaddps %ymm1, %ymm1, %ymm0 -# CHECK-NEXT: [1,0] . D======eeeeER . . . vperm2f128 $136, %ymm0, %ymm0, %ymm1 -# CHECK-NEXT: [1,1] . D========eeeeeER . . vaddps %ymm1, %ymm1, %ymm0 -# CHECK-NEXT: [2,0] . .D============eeeeER. . vperm2f128 $136, %ymm0, %ymm0, %ymm1 -# CHECK-NEXT: [2,1] . . D==============eeeeeER vaddps %ymm1, %ymm1, %ymm0 +# CHECK: [0,0] . DeeeeER. . . . . . vperm2f128 $136, %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [0,1] . D==eeeeeER. . . . . vaddps %ymm1, %ymm1, %ymm0 +# CHECK-NEXT: [1,0] . . D===eeeeER . . . . vperm2f128 $136, %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [1,1] . . .D=====eeeeeER . . . vaddps %ymm1, %ymm1, %ymm0 +# CHECK-NEXT: [2,0] . . . D======eeeeER . . vperm2f128 $136, %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [2,1] . . . . 
D========eeeeeER vaddps %ymm1, %ymm1, %ymm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -444,6 +444,6 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 7.0 0.3 0.0 vperm2f128 $136, %ymm0, %ymm0, %ymm1 -# CHECK-NEXT: 1. 3 9.0 0.0 0.0 vaddps %ymm1, %ymm1, %ymm0 -# CHECK-NEXT: 3 8.0 0.2 0.0 +# CHECK-NEXT: 0. 3 4.0 0.3 0.0 vperm2f128 $136, %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: 1. 3 6.0 0.0 0.0 vaddps %ymm1, %ymm1, %ymm0 +# CHECK-NEXT: 3 5.0 0.2 0.0