diff --git a/llvm/docs/CommandGuide/llvm-mca.rst b/llvm/docs/CommandGuide/llvm-mca.rst --- a/llvm/docs/CommandGuide/llvm-mca.rst +++ b/llvm/docs/CommandGuide/llvm-mca.rst @@ -182,6 +182,11 @@ Enable the printing of instruction encodings within the instruction info view. +.. option:: -show-barriers + + Enable the printing of LoadBarrier and StoreBarrier flags within the + instruction info view. + .. option:: -all-stats Print all hardware statistics. This enables extra statistics related to the @@ -949,15 +954,16 @@ loads, the scheduling model provides an "optimistic" load-to-use latency (which usually matches the load-to-use latency for when there is a hit in the L1D). -:program:`llvm-mca` does not know about serializing operations or memory-barrier -like instructions. The LSUnit conservatively assumes that an instruction which -has both "MayLoad" and unmodeled side effects behaves like a "soft" -load-barrier. That means, it serializes loads without forcing a flush of the -load queue. Similarly, instructions that "MayStore" and have unmodeled side -effects are treated like store barriers. A full memory barrier is a "MayLoad" -and "MayStore" instruction with unmodeled side effects. This is inaccurate, but -it is the best that we can do at the moment with the current information -available in LLVM. +:program:`llvm-mca` does not (on its own) know about serializing operations or +memory-barrier like instructions. The LSUnit used to conservatively use an +instruction's "MayLoad", "MayStore", and unmodeled side effects flags to +determine whether an instruction should be treated as a memory-barrier. This was +inaccurate in general and was changed so that now each instruction has an +IsAStoreBarrier and IsALoadBarrier flag. These flags are mca specific and +default to false for every instruction. If any instruction should have either of +these flags set, it should be done within the target's InstrPostProcess class. 
+For an example, look at the `X86InstrPostProcess::postProcessInstruction` method +within `llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp`. A load/store barrier consumes one entry of the load/store queue. A load/store barrier enforces ordering of loads/stores. A younger load cannot pass a load diff --git a/llvm/include/llvm/MCA/CustomBehaviour.h b/llvm/include/llvm/MCA/CustomBehaviour.h --- a/llvm/include/llvm/MCA/CustomBehaviour.h +++ b/llvm/include/llvm/MCA/CustomBehaviour.h @@ -43,6 +43,10 @@ virtual ~InstrPostProcess() {} + /// This method can be overridden by targets to modify the mca::Instruction + /// object after it has been lowered from the MCInst. + /// This is generally a less disruptive alternative to modifying the + /// scheduling model. virtual void postProcessInstruction(std::unique_ptr &Inst, const MCInst &MCI) {} }; diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h --- a/llvm/include/llvm/MCA/Instruction.h +++ b/llvm/include/llvm/MCA/Instruction.h @@ -517,9 +517,14 @@ // Instruction opcode which can be used by mca::CustomBehaviour unsigned Opcode; + // Flags used by the LSUnit. 
+ bool IsALoadBarrier; + bool IsAStoreBarrier; + public: InstructionBase(const InstrDesc &D, const unsigned Opcode) - : Desc(D), IsOptimizableMove(false), Operands(0), Opcode(Opcode) {} + : Desc(D), IsOptimizableMove(false), Operands(0), Opcode(Opcode), + IsALoadBarrier(false), IsAStoreBarrier(false) {} SmallVectorImpl &getDefs() { return Defs; } ArrayRef getDefs() const { return Defs; } @@ -530,6 +535,10 @@ unsigned getLatency() const { return Desc.MaxLatency; } unsigned getNumMicroOps() const { return Desc.NumMicroOps; } unsigned getOpcode() const { return Opcode; } + bool isALoadBarrier() const { return IsALoadBarrier; } + bool isAStoreBarrier() const { return IsAStoreBarrier; } + void setLoadBarrier(bool IsBarrier) { IsALoadBarrier = IsBarrier; } + void setStoreBarrier(bool IsBarrier) { IsAStoreBarrier = IsBarrier; } /// Return the MCAOperand which corresponds to index Idx within the original /// MCInst. diff --git a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp --- a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp +++ b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp @@ -68,7 +68,8 @@ unsigned LSUnit::dispatch(const InstRef &IR) { const InstrDesc &Desc = IR.getInstruction()->getDesc(); - unsigned IsMemBarrier = Desc.HasSideEffects; + bool IsStoreBarrier = IR.getInstruction()->isAStoreBarrier(); + bool IsLoadBarrier = IR.getInstruction()->isALoadBarrier(); assert((Desc.MayLoad || Desc.MayStore) && "Not a memory operation!"); if (Desc.MayLoad) @@ -111,12 +112,12 @@ CurrentStoreGroupID = NewGID; - if (IsMemBarrier) + if (IsStoreBarrier) CurrentStoreBarrierGroupID = NewGID; if (Desc.MayLoad) { CurrentLoadGroupID = NewGID; - if (IsMemBarrier) + if (IsLoadBarrier) CurrentLoadBarrierGroupID = NewGID; } @@ -141,7 +142,7 @@ // However that group has already started execution, so we cannot add // this load to it. 
bool ShouldCreateANewGroup = - IsMemBarrier || !ImmediateLoadDominator || + IsLoadBarrier || !ImmediateLoadDominator || CurrentLoadBarrierGroupID == ImmediateLoadDominator || ImmediateLoadDominator <= CurrentStoreGroupID || getGroup(ImmediateLoadDominator).isExecuting(); @@ -161,7 +162,7 @@ } // A load barrier may not pass a previous load or load barrier. - if (IsMemBarrier) { + if (IsLoadBarrier) { if (ImmediateLoadDominator) { MemoryGroup &LoadGroup = getGroup(ImmediateLoadDominator); LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" @@ -181,7 +182,7 @@ } CurrentLoadGroupID = NewGID; - if (IsMemBarrier) + if (IsLoadBarrier) CurrentLoadBarrierGroupID = NewGID; return NewGID; } diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -109,5 +109,6 @@ add_subdirectory(AsmParser) add_subdirectory(Disassembler) +add_subdirectory(MCA) add_subdirectory(MCTargetDesc) add_subdirectory(TargetInfo) diff --git a/llvm/lib/Target/X86/MCA/CMakeLists.txt b/llvm/lib/Target/X86/MCA/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/MCA/CMakeLists.txt @@ -0,0 +1,14 @@ +add_llvm_component_library(LLVMX86TargetMCA + X86CustomBehaviour.cpp + + LINK_COMPONENTS + MC + MCParser + X86Desc + X86Info + Support + MCA + + ADD_TO_COMPONENT + X86 + ) diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -0,0 +1,47 @@ +//===-------------------- X86CustomBehaviour.h ------------------*-C++ -* -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines the X86InstrPostProcess class which inherits from +/// InstrPostProcess. This class is used by the tool llvm-mca to enforce +/// target specific behaviour that is not expressed well enough in the +/// scheduling model for mca to enforce it automatically. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCA_X86CUSTOMBEHAVIOUR_H +#define LLVM_LIB_TARGET_X86_MCA_X86CUSTOMBEHAVIOUR_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/MCA/CustomBehaviour.h" +#include "llvm/Support/TargetParser.h" + +namespace llvm { +namespace mca { + +/// X86-specific InstrPostProcess: adjusts an mca::Instruction after it is +/// lowered from its MCInst (e.g. by flagging load/store barriers). +class X86InstrPostProcess : public InstrPostProcess { + /// Called within X86InstrPostProcess to specify certain instructions + /// as load and store barriers. + void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI); + +public: + X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) + : InstrPostProcess(STI, MCII) {} + + ~X86InstrPostProcess() {} + + void postProcessInstruction(std::unique_ptr<Instruction> &Inst, + const MCInst &MCI) override; +}; + +} // namespace mca +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -0,0 +1,64 @@ +//===------------------- X86CustomBehaviour.cpp -----------------*-C++ -* -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file implements methods from the X86InstrPostProcess class. +/// +//===----------------------------------------------------------------------===// + +#include "X86CustomBehaviour.h" +#include "TargetInfo/X86TargetInfo.h" +#include "X86InstrInfo.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/WithColor.h" + +namespace llvm { +namespace mca { + +void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst, + const MCInst &MCI) { + switch (MCI.getOpcode()) { + case X86::MFENCE: + Inst->setLoadBarrier(true); + Inst->setStoreBarrier(true); + break; + case X86::LFENCE: + Inst->setLoadBarrier(true); + break; + case X86::SFENCE: + Inst->setStoreBarrier(true); + break; + } +} + +void X86InstrPostProcess::postProcessInstruction( + std::unique_ptr<Instruction> &Inst, const MCInst &MCI) { + // Currently, we only modify certain instructions' IsALoadBarrier and + // IsAStoreBarrier flags. 
+ setMemBarriers(Inst, MCI); +} + +} // namespace mca +} // namespace llvm + +using namespace llvm; +using namespace mca; + +static InstrPostProcess *createX86InstrPostProcess(const MCSubtargetInfo &STI, + const MCInstrInfo &MCII) { + return new X86InstrPostProcess(STI, MCII); +} + +/// Extern function that registers the X86InstrPostProcess factory with the +/// TargetRegistry for both the 32-bit and the 64-bit X86 targets. +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86TargetMCA() { + TargetRegistry::RegisterInstrPostProcess(getTheX86_32Target(), + createX86InstrPostProcess); + TargetRegistry::RegisterInstrPostProcess(getTheX86_64Target(), + createX86InstrPostProcess); +} diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s @@ -10,12 +10,12 @@ # CHECK: Iterations: 3 # CHECK-NEXT: Instructions: 18 -# CHECK-NEXT: Total Cycles: 19 +# CHECK-NEXT: Total Cycles: 16 # CHECK-NEXT: Total uOps: 18 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.95 -# CHECK-NEXT: IPC: 0.95 +# CHECK-NEXT: uOps Per Cycle: 1.13 +# CHECK-NEXT: IPC: 1.13 # CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Instruction Info: @@ -62,27 +62,27 @@ # CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x3, [x10] # CHECK: Timeline view: -# CHECK-NEXT: 012345678 +# CHECK-NEXT: 012345 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DE . . . . str x1, [x10] -# CHECK-NEXT: [0,1] .DE . . . . str x1, [x10] -# CHECK-NEXT: [0,2] .DeeE. . . . ldr x2, [x10] -# CHECK-NEXT: [0,3] . DE. . . . nop -# CHECK-NEXT: [0,4] . DeeE . . . ldr x2, [x10] -# CHECK-NEXT: [0,5] . DeeE . . . ldr x3, [x10] -# CHECK-NEXT: [1,0] . DE . . . str x1, [x10] -# CHECK-NEXT: [1,1] . .DE . . . str x1, [x10] -# CHECK-NEXT: [1,2] . .DeeE. . . ldr x2, [x10] -# CHECK-NEXT: [1,3] . . DE. . . nop -# CHECK-NEXT: [1,4] . . DeeE . . ldr x2, [x10] -# CHECK-NEXT: [1,5] . . DeeE . . 
ldr x3, [x10] -# CHECK-NEXT: [2,0] . . DE . . str x1, [x10] -# CHECK-NEXT: [2,1] . . .DE . . str x1, [x10] -# CHECK-NEXT: [2,2] . . .DeeE. . ldr x2, [x10] -# CHECK-NEXT: [2,3] . . . DE. . nop -# CHECK-NEXT: [2,4] . . . DeeE. ldr x2, [x10] -# CHECK-NEXT: [2,5] . . . DeeE ldr x3, [x10] +# CHECK: [0,0] DE . . . str x1, [x10] +# CHECK-NEXT: [0,1] .DE . . . str x1, [x10] +# CHECK-NEXT: [0,2] .DeeE. . . ldr x2, [x10] +# CHECK-NEXT: [0,3] . DE. . . nop +# CHECK-NEXT: [0,4] . DeeE . . ldr x2, [x10] +# CHECK-NEXT: [0,5] . DeeE . . ldr x3, [x10] +# CHECK-NEXT: [1,0] . DE . . str x1, [x10] +# CHECK-NEXT: [1,1] . DE . . str x1, [x10] +# CHECK-NEXT: [1,2] . DeeE . . ldr x2, [x10] +# CHECK-NEXT: [1,3] . . DE . . nop +# CHECK-NEXT: [1,4] . . DeeE . ldr x2, [x10] +# CHECK-NEXT: [1,5] . . DeeE . ldr x3, [x10] +# CHECK-NEXT: [2,0] . . DE. . str x1, [x10] +# CHECK-NEXT: [2,1] . . DE . str x1, [x10] +# CHECK-NEXT: [2,2] . . DeeE . ldr x2, [x10] +# CHECK-NEXT: [2,3] . . .DE . nop +# CHECK-NEXT: [2,4] . . .DeeE. ldr x2, [x10] +# CHECK-NEXT: [2,5] . . . 
DeeE ldr x3, [x10] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s @@ -40,12 +40,12 @@ # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 36 -# CHECK-NEXT: Total Cycles: 331 +# CHECK-NEXT: Total Cycles: 94 # CHECK-NEXT: Total uOps: 36 # CHECK: Dispatch Width: 1 -# CHECK-NEXT: uOps Per Cycle: 0.11 -# CHECK-NEXT: IPC: 0.11 +# CHECK-NEXT: uOps Per Cycle: 0.38 +# CHECK-NEXT: IPC: 0.38 # CHECK-NEXT: Block RThroughput: 36.0 # CHECK: Instruction Info: @@ -147,45 +147,45 @@ # CHECK-NEXT: - - - 1.00 - - - s_waitcnt vmcnt(0) lgkmcnt(0) # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0 -# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 +# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123 +# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 -# CHECK: [0,0] DeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[2:3], s[0:1], 0x24 -# CHECK-NEXT: [0,1] .DeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[0:1], s[0:1], 0x2c -# CHECK-NEXT: [0,2] . .DE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . s_waitcnt lgkmcnt(0) -# CHECK-NEXT: [0,3] . . DE . . . . . . . . . . . . . . . . . . . . . . . . . 
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s2 -# CHECK-NEXT: [0,4] . . DE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s3 -# CHECK-NEXT: [0,5] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . flat_load_dword v2, v[0:1] -# CHECK-NEXT: [0,6] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . flat_load_dword v3, v[0:1] offset:8 -# CHECK-NEXT: [0,7] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . flat_load_dword v4, v[0:1] offset:16 -# CHECK-NEXT: [0,8] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. flat_load_dword v5, v[0:1] offset:24 -# CHECK-NEXT: [0,9] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s0 -# CHECK-NEXT: [0,10] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s1 -# CHECK-NEXT: [0,11] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v6, s6 -# CHECK-NEXT: [0,12] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . . . . v_mov_b32_e32 v7, s7 -# CHECK-NEXT: [0,13] . . . . . . . . . . . . . . . 
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v8, s8 -# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v9, s9 -# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . . . v_mov_b32_e32 v10, s10 -# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v11, s11 -# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . . . v_mov_b32_e32 v12, s12 -# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v13, s13 -# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v14, s14 -# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . . v_mov_b32_e32 v15, s15 -# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v16, s16 -# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . . v_mov_b32_e32 v17, s17 -# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v18, s18 -# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . 
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v19, s19 -# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . . v_mov_b32_e32 v20, s20 -# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v21, s21 -# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . . v_mov_b32_e32 v22, s22 -# CHECK-NEXT: [0,28] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v23, s23 -# CHECK-NEXT: [0,29] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v24, s24 -# CHECK-NEXT: [0,30] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DE . . . . . . . . . . . . v_mov_b32_e32 v25, s25 -# CHECK-NEXT: [0,31] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v26, s26 -# CHECK-NEXT: [0,32] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE. . . . . . . . . . . . v_mov_b32_e32 v27, s27 -# CHECK-NEXT: [0,33] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . v_mov_b32_e32 v28, s28 -# CHECK-NEXT: [0,34] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE . . . . . . . . . . . v_mov_b32_e32 v29, s29 -# CHECK-NEXT: [0,35] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DE s_waitcnt vmcnt(0) lgkmcnt(0) +# CHECK: [0,0] DeeeeE . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[2:3], s[0:1], 0x24 +# CHECK-NEXT: [0,1] .DeeeeE . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[0:1], s[0:1], 0x2c +# CHECK-NEXT: [0,2] . .DE . . . . . . . . . . . . . . . . . . s_waitcnt lgkmcnt(0) +# CHECK-NEXT: [0,3] . . DE . . . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s2 +# CHECK-NEXT: [0,4] . . DE. . . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s3 +# CHECK-NEXT: [0,5] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . flat_load_dword v2, v[0:1] +# CHECK-NEXT: [0,6] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE . flat_load_dword v3, v[0:1] offset:8 +# CHECK-NEXT: [0,7] . . .DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE . flat_load_dword v4, v[0:1] offset:16 +# CHECK-NEXT: [0,8] . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. flat_load_dword v5, v[0:1] offset:24 +# CHECK-NEXT: [0,9] . . . DE. . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s0 +# CHECK-NEXT: [0,10] . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s1 +# CHECK-NEXT: [0,11] . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v6, s6 +# CHECK-NEXT: [0,12] . . . .DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v7, s7 +# CHECK-NEXT: [0,13] . . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v8, s8 +# CHECK-NEXT: [0,14] . . . . DE. . . . . . . . . . . . . . . . v_mov_b32_e32 v9, s9 +# CHECK-NEXT: [0,15] . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v10, s10 +# CHECK-NEXT: [0,16] . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v11, s11 +# CHECK-NEXT: [0,17] . . . . .DE . . . . . . . . . . . . . . . v_mov_b32_e32 v12, s12 +# CHECK-NEXT: [0,18] . . . . . DE . . . . . . . . . . . . . . . 
v_mov_b32_e32 v13, s13 +# CHECK-NEXT: [0,19] . . . . . DE. . . . . . . . . . . . . . . v_mov_b32_e32 v14, s14 +# CHECK-NEXT: [0,20] . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v15, s15 +# CHECK-NEXT: [0,21] . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v16, s16 +# CHECK-NEXT: [0,22] . . . . . .DE . . . . . . . . . . . . . . v_mov_b32_e32 v17, s17 +# CHECK-NEXT: [0,23] . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v18, s18 +# CHECK-NEXT: [0,24] . . . . . . DE. . . . . . . . . . . . . . v_mov_b32_e32 v19, s19 +# CHECK-NEXT: [0,25] . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v20, s20 +# CHECK-NEXT: [0,26] . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v21, s21 +# CHECK-NEXT: [0,27] . . . . . . .DE . . . . . . . . . . . . . v_mov_b32_e32 v22, s22 +# CHECK-NEXT: [0,28] . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v23, s23 +# CHECK-NEXT: [0,29] . . . . . . . DE. . . . . . . . . . . . . v_mov_b32_e32 v24, s24 +# CHECK-NEXT: [0,30] . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v25, s25 +# CHECK-NEXT: [0,31] . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v26, s26 +# CHECK-NEXT: [0,32] . . . . . . . .DE . . . . . . . . . . . . v_mov_b32_e32 v27, s27 +# CHECK-NEXT: [0,33] . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v28, s28 +# CHECK-NEXT: [0,34] . . . . . . . . DE. . . . . . . . . . . . v_mov_b32_e32 v29, s29 +# CHECK-NEXT: [0,35] . . . . . . . . . . . . . . . . . . . DE s_waitcnt vmcnt(0) lgkmcnt(0) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s b/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s --- a/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s +++ b/llvm/test/tools/llvm-mca/X86/Barcelona/store-throughput.s @@ -528,10 +528,10 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movd %mm0, (%rax) -# CHECK-NEXT: 1. 
1 2.0 0.0 0.0 movd %mm1, (%rcx) -# CHECK-NEXT: 2. 1 3.0 0.0 0.0 movd %mm2, (%rdx) -# CHECK-NEXT: 3. 1 4.0 0.0 0.0 movd %mm3, (%rbx) -# CHECK-NEXT: 1 2.5 0.3 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movd %mm1, (%rcx) +# CHECK-NEXT: 2. 1 3.0 1.0 0.0 movd %mm2, (%rdx) +# CHECK-NEXT: 3. 1 4.0 1.0 0.0 movd %mm3, (%rbx) +# CHECK-NEXT: 1 2.5 1.0 0.0 # CHECK: [5] Code Region diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s @@ -519,12 +519,12 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 553 +# CHECK-NEXT: Total Cycles: 405 # CHECK-NEXT: Total uOps: 400 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.72 -# CHECK-NEXT: IPC: 0.72 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 # CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Instruction Info: @@ -544,25 +544,24 @@ # CHECK: Dynamic Dispatch Stall Cycles: # CHECK-NEXT: RAT - Register unavailable: 0 # CHECK-NEXT: RCU - Retire tokens unavailable: 0 -# CHECK-NEXT: SCHEDQ - Scheduler full: 57 (10.3%) +# CHECK-NEXT: SCHEDQ - Scheduler full: 347 (85.7%) # CHECK-NEXT: LQ - Load queue full: 0 -# CHECK-NEXT: SQ - Store queue full: 432 (78.1%) +# CHECK-NEXT: SQ - Store queue full: 0 # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK-NEXT: USH - Uncategorised Structural Hazard: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 364 (65.8%) -# CHECK-NEXT: 1, 88 (15.9%) -# CHECK-NEXT: 2, 4 (0.7%) -# CHECK-NEXT: 3, 84 (15.2%) -# CHECK-NEXT: 4, 13 (2.4%) +# CHECK-NEXT: 0, 131 (32.3%) +# CHECK-NEXT: 1, 174 (43.0%) +# CHECK-NEXT: 2, 87 (21.5%) +# CHECK-NEXT: 4, 13 (3.2%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # 
CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 253 (45.8%) -# CHECK-NEXT: 1, 200 (36.2%) -# CHECK-NEXT: 2, 100 (18.1%) +# CHECK-NEXT: 0, 105 (25.9%) +# CHECK-NEXT: 1, 200 (49.4%) +# CHECK-NEXT: 2, 100 (24.7%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. @@ -571,10 +570,10 @@ # CHECK-NEXT: [4] Total number of buffer entries. # CHECK: [1] [2] [3] [4] -# CHECK-NEXT: PdEX 23 40 40 -# CHECK-NEXT: PdFPU 23 40 64 -# CHECK-NEXT: PdLoad 3 22 40 -# CHECK-NEXT: PdStore 22 24 24 +# CHECK-NEXT: PdEX 36 40 40 +# CHECK-NEXT: PdFPU 36 40 64 +# CHECK-NEXT: PdLoad 20 23 40 +# CHECK-NEXT: PdStore 19 22 24 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 @@ -608,8 +607,8 @@ # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] Instructions: # CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - - - 3.00 - - - - 1.00 movd %mm0, (%rax) -# CHECK-NEXT: 1.50 1.50 - - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - movd (%rcx), %mm1 -# CHECK-NEXT: 1.50 1.50 - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - - movd (%rdx), %mm2 +# CHECK-NEXT: 3.00 - - - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - movd (%rcx), %mm1 +# CHECK-NEXT: - 3.00 - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - - movd (%rdx), %mm2 # CHECK-NEXT: 1.00 - - - - - - - - - - - - 1.00 - - 3.00 - - - - - 1.00 movd %mm3, (%rbx) # CHECK: Timeline view: @@ -630,8 +629,8 @@ # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movd %mm0, (%rax) # CHECK-NEXT: 1. 1 1.0 1.0 0.0 movd (%rcx), %mm1 # CHECK-NEXT: 2. 1 2.0 2.0 0.0 movd (%rdx), %mm2 -# CHECK-NEXT: 3. 1 4.0 1.0 1.0 movd %mm3, (%rbx) -# CHECK-NEXT: 1 2.0 1.3 0.3 +# CHECK-NEXT: 3. 
1 4.0 2.0 1.0 movd %mm3, (%rbx) +# CHECK-NEXT: 1 2.0 1.5 0.3 # CHECK: [5] Code Region diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/pr37790.s b/llvm/test/tools/llvm-mca/X86/BdVer2/pr37790.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/pr37790.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/pr37790.s @@ -6,12 +6,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 4 -# CHECK-NEXT: Total Cycles: 205 +# CHECK-NEXT: Total Cycles: 103 # CHECK-NEXT: Total uOps: 6 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.03 -# CHECK-NEXT: IPC: 0.02 +# CHECK-NEXT: uOps Per Cycle: 0.06 +# CHECK-NEXT: IPC: 0.04 # CHECK-NEXT: Block RThroughput: 18.0 # CHECK: Instruction Info: @@ -28,10 +28,12 @@ # CHECK: Timeline view: # CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123 +# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 012 -# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER. int3 -# CHECK-NEXT: [0,1] D====================================================================================================eER stmxcsr (%rsp) +# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER int3 +# CHECK-NEXT: [0,1] DeE---------------------------------------------------------------------------------------------------R stmxcsr (%rsp) +# CHECK-NEXT: [1,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER int3 +# CHECK-NEXT: [1,1] .D=================eE---------------------------------------------------------------------------------R stmxcsr (%rsp) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -40,6 +42,6 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 
2 51.5 0.5 0.0 int3 -# CHECK-NEXT: 1. 2 151.0 0.0 0.0 stmxcsr (%rsp) -# CHECK-NEXT: 2 101.3 0.3 0.0 +# CHECK-NEXT: 0. 2 1.0 0.5 0.0 int3 +# CHECK-NEXT: 1. 2 9.5 9.0 90.0 stmxcsr (%rsp) +# CHECK-NEXT: 2 5.3 4.8 45.0 diff --git a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s --- a/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s +++ b/llvm/test/tools/llvm-mca/X86/BdVer2/store-throughput.s @@ -514,12 +514,12 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 803 +# CHECK-NEXT: Total Cycles: 603 # CHECK-NEXT: Total uOps: 400 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.50 -# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: uOps Per Cycle: 0.66 +# CHECK-NEXT: IPC: 0.66 # CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Instruction Info: @@ -541,21 +541,21 @@ # CHECK-NEXT: RCU - Retire tokens unavailable: 0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 -# CHECK-NEXT: SQ - Store queue full: 748 (93.2%) +# CHECK-NEXT: SQ - Store queue full: 560 (92.9%) # CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0 # CHECK-NEXT: USH - Uncategorised Structural Hazard: 0 # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 422 (52.6%) -# CHECK-NEXT: 1, 374 (46.6%) -# CHECK-NEXT: 2, 1 (0.1%) -# CHECK-NEXT: 4, 6 (0.7%) +# CHECK-NEXT: 0, 222 (36.8%) +# CHECK-NEXT: 1, 374 (62.0%) +# CHECK-NEXT: 2, 1 (0.2%) +# CHECK-NEXT: 4, 6 (1.0%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 403 (50.2%) -# CHECK-NEXT: 1, 400 (49.8%) +# CHECK-NEXT: 0, 203 (33.7%) +# CHECK-NEXT: 1, 400 (66.3%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: [1] Resource name. @@ -564,8 +564,8 @@ # CHECK-NEXT: [4] Total number of buffer entries. 
# CHECK: [1] [2] [3] [4] -# CHECK-NEXT: PdEX 21 23 40 -# CHECK-NEXT: PdFPU 21 23 64 +# CHECK-NEXT: PdEX 21 22 40 +# CHECK-NEXT: PdFPU 21 22 64 # CHECK-NEXT: PdLoad 0 0 40 # CHECK-NEXT: PdStore 22 24 24 @@ -606,13 +606,12 @@ # CHECK-NEXT: 1.00 - - - - - - - - - - - - 1.00 - - 3.00 - - - - - 1.00 movd %mm3, (%rbx) # CHECK: Timeline view: -# CHECK-NEXT: 0 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeeER. . movd %mm0, (%rax) -# CHECK-NEXT: [0,1] D==eeER . movd %mm1, (%rcx) -# CHECK-NEXT: [0,2] D====eeER . movd %mm2, (%rdx) -# CHECK-NEXT: [0,3] D======eeER movd %mm3, (%rbx) +# CHECK: [0,0] DeeER. . movd %mm0, (%rax) +# CHECK-NEXT: [0,1] D=eeER . movd %mm1, (%rcx) +# CHECK-NEXT: [0,2] D===eeER. movd %mm2, (%rdx) +# CHECK-NEXT: [0,3] D====eeER movd %mm3, (%rbx) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -622,10 +621,10 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 movd %mm0, (%rax) -# CHECK-NEXT: 1. 1 3.0 0.0 0.0 movd %mm1, (%rcx) -# CHECK-NEXT: 2. 1 5.0 0.0 0.0 movd %mm2, (%rdx) -# CHECK-NEXT: 3. 1 7.0 0.0 0.0 movd %mm3, (%rbx) -# CHECK-NEXT: 1 4.0 0.3 0.0 +# CHECK-NEXT: 1. 1 2.0 1.0 0.0 movd %mm1, (%rcx) +# CHECK-NEXT: 2. 1 4.0 2.0 0.0 movd %mm2, (%rdx) +# CHECK-NEXT: 3. 
1 5.0 1.0 0.0 movd %mm3, (%rbx) +# CHECK-NEXT: 1 3.0 1.3 0.0 # CHECK: [5] Code Region diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/pr37790.s b/llvm/test/tools/llvm-mca/X86/BtVer2/pr37790.s --- a/llvm/test/tools/llvm-mca/X86/BtVer2/pr37790.s +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/pr37790.s @@ -6,12 +6,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 4 -# CHECK-NEXT: Total Cycles: 205 +# CHECK-NEXT: Total Cycles: 104 # CHECK-NEXT: Total uOps: 4 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.02 -# CHECK-NEXT: IPC: 0.02 +# CHECK-NEXT: uOps Per Cycle: 0.04 +# CHECK-NEXT: IPC: 0.04 # CHECK-NEXT: Block RThroughput: 1.0 # CHECK: Instruction Info: @@ -31,7 +31,9 @@ # CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123 # CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER. int3 -# CHECK-NEXT: [0,1] D====================================================================================================eER stmxcsr (%rsp) +# CHECK-NEXT: [0,1] DeE---------------------------------------------------------------------------------------------------R. stmxcsr (%rsp) +# CHECK-NEXT: [1,0] .DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER int3 +# CHECK-NEXT: [1,1] .DeE---------------------------------------------------------------------------------------------------R stmxcsr (%rsp) # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -40,6 +42,6 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 51.0 0.5 0.0 int3 -# CHECK-NEXT: 1. 2 151.0 0.0 0.0 stmxcsr (%rsp) -# CHECK-NEXT: 2 101.0 0.3 0.0 +# CHECK-NEXT: 0. 2 1.0 1.0 0.0 int3 +# CHECK-NEXT: 1. 
2 1.0 0.0 99.0 stmxcsr (%rsp) +# CHECK-NEXT: 2 1.0 0.5 49.5 diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s b/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s --- a/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s @@ -12,12 +12,12 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 600 -# CHECK-NEXT: Total Cycles: 704 +# CHECK-NEXT: Total Cycles: 308 # CHECK-NEXT: Total uOps: 600 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.85 -# CHECK-NEXT: IPC: 0.85 +# CHECK-NEXT: uOps Per Cycle: 1.95 +# CHECK-NEXT: IPC: 1.95 # CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Instruction Info: @@ -66,27 +66,27 @@ # CHECK-NEXT: 0.50 0.50 - - - - - 1.00 - - - - - - retq # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 01234 +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeER . . . . . stmxcsr -4(%rsp) -# CHECK-NEXT: [0,1] DeER . . . . . movl $-24577, %eax -# CHECK-NEXT: [0,2] .DeeeeER . . . . andl -4(%rsp), %eax -# CHECK-NEXT: [0,3] .D====eER . . . . movl %eax, -8(%rsp) -# CHECK-NEXT: [0,4] . D===eeeER . . . ldmxcsr -8(%rsp) -# CHECK-NEXT: [0,5] . DeeeeE--R . . . retq -# CHECK-NEXT: [1,0] . D=====eER . . . stmxcsr -4(%rsp) -# CHECK-NEXT: [1,1] . DeE-----R . . . movl $-24577, %eax -# CHECK-NEXT: [1,2] . D====eeeeER. . . andl -4(%rsp), %eax -# CHECK-NEXT: [1,3] . D========eER . . movl %eax, -8(%rsp) -# CHECK-NEXT: [1,4] . D=======eeeER . . ldmxcsr -8(%rsp) -# CHECK-NEXT: [1,5] . D=eeeeE-----R . . retq -# CHECK-NEXT: [2,0] . .D=========eER . . stmxcsr -4(%rsp) -# CHECK-NEXT: [2,1] . .DeE---------R . . movl $-24577, %eax -# CHECK-NEXT: [2,2] . . D========eeeeER . andl -4(%rsp), %eax -# CHECK-NEXT: [2,3] . . D============eER . movl %eax, -8(%rsp) -# CHECK-NEXT: [2,4] . . D===========eeeER ldmxcsr -8(%rsp) -# CHECK-NEXT: [2,5] . . D=eeeeE---------R retq +# CHECK: [0,0] DeER . . .. stmxcsr -4(%rsp) +# CHECK-NEXT: [0,1] DeER . . .. 
movl $-24577, %eax +# CHECK-NEXT: [0,2] .DeeeeER . .. andl -4(%rsp), %eax +# CHECK-NEXT: [0,3] .D====eER . .. movl %eax, -8(%rsp) +# CHECK-NEXT: [0,4] . D===eeeER .. ldmxcsr -8(%rsp) +# CHECK-NEXT: [0,5] . DeeeeE--R .. retq +# CHECK-NEXT: [1,0] . D===eE--R .. stmxcsr -4(%rsp) +# CHECK-NEXT: [1,1] . DeE-----R .. movl $-24577, %eax +# CHECK-NEXT: [1,2] . DeeeeE--R .. andl -4(%rsp), %eax +# CHECK-NEXT: [1,3] . D====eE-R .. movl %eax, -8(%rsp) +# CHECK-NEXT: [1,4] . D===eeeER .. ldmxcsr -8(%rsp) +# CHECK-NEXT: [1,5] . D=eeeeE-R .. retq +# CHECK-NEXT: [2,0] . .D===eE--R.. stmxcsr -4(%rsp) +# CHECK-NEXT: [2,1] . .DeE-----R.. movl $-24577, %eax +# CHECK-NEXT: [2,2] . . DeeeeE--R. andl -4(%rsp), %eax +# CHECK-NEXT: [2,3] . . D====eE-R. movl %eax, -8(%rsp) +# CHECK-NEXT: [2,4] . . D===eeeER ldmxcsr -8(%rsp) +# CHECK-NEXT: [2,5] . . D=eeeeE-R retq # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -95,10 +95,10 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 5.7 0.3 0.0 stmxcsr -4(%rsp) -# CHECK-NEXT: 1. 3 1.0 1.0 4.7 movl $-24577, %eax -# CHECK-NEXT: 2. 3 5.0 0.3 0.0 andl -4(%rsp), %eax -# CHECK-NEXT: 3. 3 9.0 0.0 0.0 movl %eax, -8(%rsp) -# CHECK-NEXT: 4. 3 8.0 0.0 0.0 ldmxcsr -8(%rsp) -# CHECK-NEXT: 5. 3 1.7 1.7 5.3 retq -# CHECK-NEXT: 3 5.1 0.6 1.7 +# CHECK-NEXT: 0. 3 3.0 1.0 1.3 stmxcsr -4(%rsp) +# CHECK-NEXT: 1. 3 1.0 1.0 3.3 movl $-24577, %eax +# CHECK-NEXT: 2. 3 1.0 1.0 1.3 andl -4(%rsp), %eax +# CHECK-NEXT: 3. 3 5.0 0.0 0.7 movl %eax, -8(%rsp) +# CHECK-NEXT: 4. 3 4.0 0.0 0.0 ldmxcsr -8(%rsp) +# CHECK-NEXT: 5. 
3 1.7 1.7 1.3 retq +# CHECK-NEXT: 3 2.6 0.8 1.3 diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/reserved-resources.s b/llvm/test/tools/llvm-mca/X86/Haswell/reserved-resources.s --- a/llvm/test/tools/llvm-mca/X86/Haswell/reserved-resources.s +++ b/llvm/test/tools/llvm-mca/X86/Haswell/reserved-resources.s @@ -5,11 +5,11 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 100 -# CHECK-NEXT: Total Cycles: 6403 +# CHECK-NEXT: Total Cycles: 4720 # CHECK-NEXT: Total uOps: 9000 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.41 +# CHECK-NEXT: uOps Per Cycle: 1.91 # CHECK-NEXT: IPC: 0.02 # CHECK-NEXT: Block RThroughput: 22.5 diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s b/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s --- a/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s +++ b/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s @@ -12,12 +12,12 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 600 -# CHECK-NEXT: Total Cycles: 1304 +# CHECK-NEXT: Total Cycles: 413 # CHECK-NEXT: Total uOps: 1300 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.00 -# CHECK-NEXT: IPC: 0.46 +# CHECK-NEXT: uOps Per Cycle: 3.15 +# CHECK-NEXT: IPC: 1.45 # CHECK-NEXT: Block RThroughput: 3.3 # CHECK: Instruction Info: @@ -50,39 +50,39 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - - 1.74 1.74 1.67 1.68 2.00 1.74 1.78 1.65 +# CHECK-NEXT: - - 1.99 1.50 1.66 1.67 2.00 1.52 1.99 1.67 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: -# CHECK-NEXT: - - - - 0.30 - 1.00 1.00 - 0.70 stmxcsr -4(%rsp) -# CHECK-NEXT: - - 0.03 0.53 - - - 0.23 0.21 - movl $-24577, %eax -# CHECK-NEXT: - - 0.22 0.58 0.35 0.65 - - 0.20 - andl -4(%rsp), %eax -# CHECK-NEXT: - - - - 0.05 - 1.00 - - 0.95 movl %eax, -8(%rsp) -# CHECK-NEXT: - - 1.00 0.21 0.34 0.66 - 0.42 0.37 - ldmxcsr -8(%rsp) -# CHECK-NEXT: - - 0.49 0.42 0.63 0.37 - 
0.09 1.00 - retq +# CHECK-NEXT: - - - - 0.16 - 1.00 1.00 - 0.84 stmxcsr -4(%rsp) +# CHECK-NEXT: - - 0.49 0.49 - - - 0.01 0.01 - movl $-24577, %eax +# CHECK-NEXT: - - 0.49 0.02 0.49 0.51 - 0.01 0.48 - andl -4(%rsp), %eax +# CHECK-NEXT: - - - - 0.17 - 1.00 - - 0.83 movl %eax, -8(%rsp) +# CHECK-NEXT: - - 1.00 0.01 0.33 0.67 - 0.49 0.50 - ldmxcsr -8(%rsp) +# CHECK-NEXT: - - 0.01 0.98 0.51 0.49 - 0.01 1.00 - retq # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 012 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 -# CHECK: [0,0] DeeER. . . . . . . . . stmxcsr -4(%rsp) -# CHECK-NEXT: [0,1] DeE-R. . . . . . . . . movl $-24577, %eax -# CHECK-NEXT: [0,2] .DeeeeeeER. . . . . . . . andl -4(%rsp), %eax -# CHECK-NEXT: [0,3] .D======eER . . . . . . . movl %eax, -8(%rsp) -# CHECK-NEXT: [0,4] . D=====eeeeeeeER . . . . . . ldmxcsr -8(%rsp) -# CHECK-NEXT: [0,5] . DeeeeeeeE----R . . . . . . retq -# CHECK-NEXT: [1,0] . D==========eeER . . . . . . stmxcsr -4(%rsp) -# CHECK-NEXT: [1,1] . DeE-----------R . . . . . . movl $-24577, %eax -# CHECK-NEXT: [1,2] . D=========eeeeeeER . . . . . andl -4(%rsp), %eax -# CHECK-NEXT: [1,3] . D===============eER . . . . . movl %eax, -8(%rsp) -# CHECK-NEXT: [1,4] . .D==============eeeeeeeER. . . . ldmxcsr -8(%rsp) -# CHECK-NEXT: [1,5] . . DeeeeeeeE-------------R. . . . retq -# CHECK-NEXT: [2,0] . . D===================eeER . . . stmxcsr -4(%rsp) -# CHECK-NEXT: [2,1] . . DeE--------------------R . . . movl $-24577, %eax -# CHECK-NEXT: [2,2] . . D==================eeeeeeER . . andl -4(%rsp), %eax -# CHECK-NEXT: [2,3] . . D========================eER . . movl %eax, -8(%rsp) -# CHECK-NEXT: [2,4] . . D=======================eeeeeeeER ldmxcsr -8(%rsp) -# CHECK-NEXT: [2,5] . . .DeeeeeeeE----------------------R retq +# CHECK: [0,0] DeeER. . . . . stmxcsr -4(%rsp) +# CHECK-NEXT: [0,1] DeE-R. . . . . movl $-24577, %eax +# CHECK-NEXT: [0,2] .DeeeeeeER. . . . 
andl -4(%rsp), %eax +# CHECK-NEXT: [0,3] .D======eER . . . movl %eax, -8(%rsp) +# CHECK-NEXT: [0,4] . D=====eeeeeeeER . . ldmxcsr -8(%rsp) +# CHECK-NEXT: [0,5] . DeeeeeeeE----R . . retq +# CHECK-NEXT: [1,0] . D====eeE----R . . stmxcsr -4(%rsp) +# CHECK-NEXT: [1,1] . DeE---------R . . movl $-24577, %eax +# CHECK-NEXT: [1,2] . DeeeeeeE---R . . andl -4(%rsp), %eax +# CHECK-NEXT: [1,3] . D======eE--R . . movl %eax, -8(%rsp) +# CHECK-NEXT: [1,4] . .D=====eeeeeeeER . ldmxcsr -8(%rsp) +# CHECK-NEXT: [1,5] . . D=eeeeeeeE---R . retq +# CHECK-NEXT: [2,0] . . D====eeE----R . stmxcsr -4(%rsp) +# CHECK-NEXT: [2,1] . . DeE---------R . movl $-24577, %eax +# CHECK-NEXT: [2,2] . . DeeeeeeE---R . andl -4(%rsp), %eax +# CHECK-NEXT: [2,3] . . D======eE--R . movl %eax, -8(%rsp) +# CHECK-NEXT: [2,4] . . D=====eeeeeeeER ldmxcsr -8(%rsp) +# CHECK-NEXT: [2,5] . . .DeeeeeeeE----R retq # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -91,10 +91,10 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 10.7 0.3 0.0 stmxcsr -4(%rsp) -# CHECK-NEXT: 1. 3 1.0 1.0 10.7 movl $-24577, %eax -# CHECK-NEXT: 2. 3 10.0 0.3 0.0 andl -4(%rsp), %eax -# CHECK-NEXT: 3. 3 16.0 0.0 0.0 movl %eax, -8(%rsp) -# CHECK-NEXT: 4. 3 15.0 0.0 0.0 ldmxcsr -8(%rsp) -# CHECK-NEXT: 5. 3 1.0 1.0 13.0 retq -# CHECK-NEXT: 3 8.9 0.4 3.9 +# CHECK-NEXT: 0. 3 3.7 1.0 2.7 stmxcsr -4(%rsp) +# CHECK-NEXT: 1. 3 1.0 1.0 6.3 movl $-24577, %eax +# CHECK-NEXT: 2. 3 1.0 1.0 2.0 andl -4(%rsp), %eax +# CHECK-NEXT: 3. 3 7.0 0.0 1.3 movl %eax, -8(%rsp) +# CHECK-NEXT: 4. 3 6.0 0.0 0.0 ldmxcsr -8(%rsp) +# CHECK-NEXT: 5. 
3 1.3 1.3 3.7 retq +# CHECK-NEXT: 3 3.3 0.7 2.7 diff --git a/llvm/test/tools/llvm-mca/X86/barrier_output.s b/llvm/test/tools/llvm-mca/X86/barrier_output.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/barrier_output.s @@ -0,0 +1,25 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -resource-pressure=false -summary-view=false -show-barriers < %s | FileCheck %s + +clflush (%rax) +lfence +mfence +sfence +maskmovdqu %xmm0, %xmm1 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) +# CHECK-NEXT: [7]: LoadBarrier +# CHECK-NEXT: [8]: StoreBarrier + +# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] Instructions: +# CHECK-NEXT: 4 5 1.00 * * U clflush (%rax) +# CHECK-NEXT: 1 1 1.00 * * U * lfence +# CHECK-NEXT: 1 1 1.00 * * U * * mfence +# CHECK-NEXT: 1 1 1.00 * * U * sfence +# CHECK-NEXT: 1 1 1.00 * * U maskmovdqu %xmm0, %xmm1 diff --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.h b/llvm/tools/llvm-mca/Views/InstructionInfoView.h --- a/llvm/tools/llvm-mca/Views/InstructionInfoView.h +++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.h @@ -54,6 +54,9 @@ const llvm::MCInstrInfo &MCII; CodeEmitter &CE; bool PrintEncodings; + bool PrintBarriers; + using UniqueInst = std::unique_ptr<Instruction>; + ArrayRef<UniqueInst> LoweredInsts; struct InstructionInfoViewData { unsigned NumMicroOpcodes = 0; @@ -72,9 +75,12 @@ InstructionInfoView(const llvm::MCSubtargetInfo &ST, const llvm::MCInstrInfo &II, CodeEmitter &C, bool ShouldPrintEncodings, llvm::ArrayRef<llvm::MCInst> S, - llvm::MCInstPrinter &IP) + llvm::MCInstPrinter &IP, + ArrayRef<UniqueInst> LoweredInsts, + bool ShouldPrintBarriers) : InstructionView(ST, IP, S), MCII(II), CE(C), - PrintEncodings(ShouldPrintEncodings) {} + PrintEncodings(ShouldPrintEncodings), +
PrintBarriers(ShouldPrintBarriers), LoweredInsts(LoweredInsts) {} void printView(llvm::raw_ostream &OS) const override; StringRef getNameAsString() const override { return "InstructionInfoView"; } diff --git a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp --- a/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp +++ b/llvm/tools/llvm-mca/Views/InstructionInfoView.cpp @@ -32,14 +32,30 @@ TempStream << "\n\nInstruction Info:\n"; TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n" << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects (U)\n"; + if (PrintBarriers) { + TempStream << "[7]: LoadBarrier\n[8]: StoreBarrier\n"; + } if (PrintEncodings) { - TempStream << "[7]: Encoding Size\n"; - TempStream << "\n[1] [2] [3] [4] [5] [6] [7] " - << "Encodings: Instructions:\n"; + if (PrintBarriers) { + TempStream << "[9]: Encoding Size\n"; + TempStream << "\n[1] [2] [3] [4] [5] [6] [7] [8] " + << "[9] Encodings: Instructions:\n"; + } else { + TempStream << "[7]: Encoding Size\n"; + TempStream << "\n[1] [2] [3] [4] [5] [6] [7] " + << "Encodings: Instructions:\n"; + } } else { - TempStream << "\n[1] [2] [3] [4] [5] [6] Instructions:\n"; + if (PrintBarriers) { + TempStream << "\n[1] [2] [3] [4] [5] [6] [7] [8] " + << "Instructions:\n"; + } else { + TempStream << "\n[1] [2] [3] [4] [5] [6] " + << "Instructions:\n"; + } } + int Index = 0; for (const auto &I : enumerate(zip(IIVD, Source))) { const InstructionInfoViewData &IIVDEntry = std::get<0>(I.value()); @@ -68,6 +84,13 @@ TempStream << (IIVDEntry.mayStore ? " * " : " "); TempStream << (IIVDEntry.hasUnmodeledSideEffects ? " U " : " "); + if (PrintBarriers) { + TempStream << (LoweredInsts[Index]->isALoadBarrier() ? " * " + : " "); + TempStream << (LoweredInsts[Index]->isAStoreBarrier() ? 
" * " + : " "); + } + if (PrintEncodings) { StringRef Encoding(CE.getEncoding(I.index())); unsigned EncodingSize = Encoding.size(); @@ -83,6 +106,7 @@ const MCInst &Inst = std::get<1>(I.value()); TempStream << printInstructionString(Inst) << '\n'; + ++Index; } TempStream.flush(); diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -219,6 +219,11 @@ cl::desc("Print encoding information in the instruction info view"), cl::cat(ViewOptions), cl::init(false)); +static cl::opt<bool> ShowBarriers( + "show-barriers", + cl::desc("Print memory barrier information in the instruction info view"), + cl::cat(ViewOptions), cl::init(false)); + static cl::opt<bool> DisableCustomBehaviour( "disable-cb", cl::desc( @@ -504,7 +509,7 @@ // (which does nothing). IPP = std::make_unique<mca::InstrPostProcess>(*STI, *MCII); - std::vector<std::unique_ptr<mca::Instruction>> LoweredSequence; + SmallVector<std::unique_ptr<mca::Instruction>> LoweredSequence; for (const MCInst &MCI : Insts) { Expected<std::unique_ptr<mca::Instruction>> Inst = IB.createInstruction(MCI); @@ -548,7 +553,8 @@ // Create the views for this pipeline, execute, and emit a report. if (PrintInstructionInfoView) { Printer.addView(std::make_unique<mca::InstructionInfoView>( - *STI, *MCII, CE, ShowEncoding, Insts, *IP)); + *STI, *MCII, CE, ShowEncoding, Insts, *IP, LoweredSequence, + ShowBarriers)); } Printer.addView( std::make_unique<mca::ResourcePressureView>(*STI, *IP, Insts)); @@ -624,7 +630,8 @@ if (PrintInstructionInfoView) Printer.addView(std::make_unique<mca::InstructionInfoView>( - *STI, *MCII, CE, ShowEncoding, Insts, *IP)); + *STI, *MCII, CE, ShowEncoding, Insts, *IP, LoweredSequence, + ShowBarriers)); // Fetch custom Views that are to be placed after the InstructionInfoView. // Refer to the comment paired with the CB->getStartViews(*IP, Insts); line