Index: docs/CommandGuide/llvm-mca.rst =================================================================== --- docs/CommandGuide/llvm-mca.rst +++ docs/CommandGuide/llvm-mca.rst @@ -169,6 +169,12 @@ the theoretical uniform distribution of resource pressure for every instruction in sequence. +.. option:: -bottleneck-analysis + + Print information about bottlenecks that affect the throughput. This analysis + can be expensive, and it is disabled by default. Bottlenecks are highlighted + in the summary view. + EXIT STATUS ----------- Index: include/llvm/MCA/Context.h =================================================================== --- include/llvm/MCA/Context.h +++ include/llvm/MCA/Context.h @@ -32,14 +32,16 @@ /// the pre-built "default" out-of-order pipeline. struct PipelineOptions { PipelineOptions(unsigned DW, unsigned RFS, unsigned LQS, unsigned SQS, - bool NoAlias) + bool NoAlias, bool ShouldEnableBottleneckAnalysis = false) : DispatchWidth(DW), RegisterFileSize(RFS), LoadQueueSize(LQS), - StoreQueueSize(SQS), AssumeNoAlias(NoAlias) {} + StoreQueueSize(SQS), AssumeNoAlias(NoAlias), + EnableBottleneckAnalysis(ShouldEnableBottleneckAnalysis) {} unsigned DispatchWidth; unsigned RegisterFileSize; unsigned LoadQueueSize; unsigned StoreQueueSize; bool AssumeNoAlias; + bool EnableBottleneckAnalysis; }; class Context { Index: include/llvm/MCA/HWEventListener.h =================================================================== --- include/llvm/MCA/HWEventListener.h +++ include/llvm/MCA/HWEventListener.h @@ -125,6 +125,34 @@ const InstRef &IR; }; +// A HWPressureEvent describes an increase in backend pressure caused by +// the presence of data dependencies or unavailability of pipeline resources. +class HWPressureEvent { +public: + enum GenericReason { + INVALID = 0, + // Scheduler was unable to issue all the ready instructions because some + // pipeline resources were unavailable. + RESOURCES, + // Instructions could not be issued because of register data dependencies. + REGISTER_DEPS, + // Instructions could not be issued because of memory dependencies. + MEMORY_DEPS + }; + + HWPressureEvent(unsigned reason, ArrayRef Insts, uint64_t Mask = 0) + : Reason(reason), AffectedInstructions(Insts), ResourceMask(Mask) { } + + // Reason for this increase in backend pressure. + unsigned Reason; + + // Instructions affected (i.e. delayed) by this increase in backend pressure. + ArrayRef AffectedInstructions; + + // A mask of unavailable processor resources. + const uint64_t ResourceMask; +}; + class HWEventListener { public: // Generic events generated by the pipeline. @@ -133,6 +161,7 @@ virtual void onEvent(const HWInstructionEvent &Event) {} virtual void onEvent(const HWStallEvent &Event) {} + virtual void onEvent(const HWPressureEvent &Event) {} using ResourceRef = std::pair; virtual void onResourceAvailable(const ResourceRef &RRef) {} Index: include/llvm/MCA/HardwareUnits/Scheduler.h =================================================================== --- include/llvm/MCA/HardwareUnits/Scheduler.h +++ include/llvm/MCA/HardwareUnits/Scheduler.h @@ -180,8 +180,8 @@ /// Check if the instruction in 'IR' can be dispatched during this cycle. /// Return SC_AVAILABLE if both scheduler and LS resources are available. /// - /// This method internally sets field HadTokenStall based on the Scheduler - /// Status value. + /// This method is also responsible for setting field HadTokenStall if + /// IR cannot be dispatched to the Scheduler due to unavailable resources. Status isAvailable(const InstRef &IR); /// Reserves buffer and LSUnit queue resources that are necessary to issue @@ -225,16 +225,25 @@ /// resources are not available. InstRef select(); - /// Returns a mask of busy resources. Each bit of the mask identifies a unique - /// processor resource unit. In the absence of bottlenecks caused by resource - /// pressure, the mask value returned by this method is always zero. - uint64_t getBusyResourceUnits() const { return BusyResourceUnits; } bool arePipelinesFullyUsed() const { return !Resources->getAvailableProcResUnits(); } bool isReadySetEmpty() const { return ReadySet.empty(); } bool isWaitSetEmpty() const { return WaitSet.empty(); } + /// This method is called by the ExecuteStage at the end of each cycle to + /// identify bottlenecks caused by data dependencies. Vector RegDeps is + /// populated by instructions that were not issued because of unsolved + /// register dependencies. Vector MemDeps is populated by instructions that + /// were not issued because of unsolved memory dependencies. + void analyzeDataDependencies(SmallVectorImpl &RegDeps, + SmallVectorImpl &MemDeps); + + /// Returns a mask of busy resources, and populates vector Insts with + /// instructions that could not be issued to the underlying pipelines because + /// not all pipeline resources were available. + uint64_t analyzeResourcePressure(SmallVectorImpl &Insts); + // Returns true if the dispatch logic couldn't dispatch a full group due to // unavailable scheduler and/or LS resources. bool hadTokenStall() const { return HadTokenStall; } Index: include/llvm/MCA/Instruction.h =================================================================== --- include/llvm/MCA/Instruction.h +++ include/llvm/MCA/Instruction.h @@ -448,7 +448,13 @@ // Retire Unit token ID for this instruction. unsigned RCUTokenID; + // A bitmask of busy processor resource units. + // This field is set to zero only if execution is not delayed during this + // cycle because of unavailable pipeline resources. uint64_t CriticalResourceMask; + + // An instruction identifier. This field is only set if execution is delayed + // by a memory dependency. unsigned CriticalMemDep; public: @@ -499,12 +505,12 @@ Stage = IS_RETIRED; } - void updateCriticalResourceMask(uint64_t BusyResourceUnits) { - CriticalResourceMask |= BusyResourceUnits; - } uint64_t getCriticalResourceMask() const { return CriticalResourceMask; } - void setCriticalMemDep(unsigned IID) { CriticalMemDep = IID; } unsigned getCriticalMemDep() const { return CriticalMemDep; } + void setCriticalResourceMask(uint64_t ResourceMask) { + CriticalResourceMask = ResourceMask; + } + void setCriticalMemDep(unsigned IID) { CriticalMemDep = IID; } void cycleEvent(); }; Index: include/llvm/MCA/Stages/ExecuteStage.h =================================================================== --- include/llvm/MCA/Stages/ExecuteStage.h +++ include/llvm/MCA/Stages/ExecuteStage.h @@ -28,6 +28,13 @@ class ExecuteStage final : public Stage { Scheduler &HWS; + unsigned NumDispatchedOpcodes; + unsigned NumIssuedOpcodes; + bool DispatchStallCycle; + + // True if this stage should notify pressure events to listeners. + bool EnablePressureEvents; + Error issueInstruction(InstRef &IR); // Called at the beginning of each cycle to issue already dispatched @@ -41,7 +48,11 @@ ExecuteStage &operator=(const ExecuteStage &Other) = delete; public: - ExecuteStage(Scheduler &S) : Stage(), HWS(S) {} + ExecuteStage(Scheduler &S) : ExecuteStage(S, false) {} + ExecuteStage(Scheduler &S, bool ShouldPerformBottleneckAnalysis) + : Stage(), HWS(S), NumDispatchedOpcodes(0), NumIssuedOpcodes(0), + DispatchStallCycle(false), + EnablePressureEvents(ShouldPerformBottleneckAnalysis) {} // This stage works under the assumption that the Pipeline will eventually // execute a retire stage. We don't need to check if pipelines and/or @@ -60,6 +71,7 @@ // Instructions that transitioned to the 'Executed' state are automatically // moved to the next stage (i.e. RetireStage). Error cycleStart() override; + Error cycleEnd() override; Error execute(InstRef &IR) override; void notifyInstructionIssued( Index: lib/MCA/Context.cpp =================================================================== --- lib/MCA/Context.cpp +++ lib/MCA/Context.cpp @@ -42,7 +42,8 @@ auto Fetch = llvm::make_unique(SrcMgr); auto Dispatch = llvm::make_unique(STI, MRI, Opts.DispatchWidth, *RCU, *PRF); - auto Execute = llvm::make_unique(*HWS); + auto Execute = + llvm::make_unique(*HWS, Opts.EnableBottleneckAnalysis); auto Retire = llvm::make_unique(*RCU, *PRF); // Pass the ownership of all the hardware units to this Context. Index: lib/MCA/HardwareUnits/Scheduler.cpp =================================================================== --- lib/MCA/HardwareUnits/Scheduler.cpp +++ lib/MCA/HardwareUnits/Scheduler.cpp @@ -183,9 +183,10 @@ InstRef &IR = ReadySet[I]; if (QueueIndex == ReadySet.size() || Strategy->compare(IR, ReadySet[QueueIndex])) { - const InstrDesc &D = IR.getInstruction()->getDesc(); + Instruction &IS = *IR.getInstruction(); + const InstrDesc &D = IS.getDesc(); uint64_t BusyResourceMask = Resources->checkAvailability(D); - IR.getInstruction()->updateCriticalResourceMask(BusyResourceMask); + IS.setCriticalResourceMask(BusyResourceMask); BusyResourceUnits |= BusyResourceMask; if (!BusyResourceMask) QueueIndex = I; @@ -227,6 +228,30 @@ IssuedSet.resize(IssuedSet.size() - RemovedElements); } +uint64_t Scheduler::analyzeResourcePressure(SmallVectorImpl &Insts) { + for (const InstRef &IR : ReadySet) + Insts.emplace_back(IR); + return BusyResourceUnits; +} + +void Scheduler::analyzeDataDependencies(SmallVectorImpl &RegDeps, + SmallVectorImpl &MemDeps) { + const auto EndIt = PendingSet.end() - NumDispatchedToThePendingSet; + for (InstRef &IR : make_range(PendingSet.begin(), EndIt)) { + Instruction &IS = *IR.getInstruction(); + if (Resources->checkAvailability(IS.getDesc())) + continue; + + if (IS.isReady() || + (IS.isMemOp() && LSU.isReady(IR) != IR.getSourceIndex())) { + MemDeps.emplace_back(IR); + continue; + } + + RegDeps.emplace_back(IR); + } +} + void Scheduler::cycleEvent(SmallVectorImpl &Freed, SmallVectorImpl &Executed, SmallVectorImpl &Ready) { Index: lib/MCA/Stages/ExecuteStage.cpp =================================================================== --- lib/MCA/Stages/ExecuteStage.cpp +++ lib/MCA/Stages/ExecuteStage.cpp @@ -54,6 +54,7 @@ SmallVector, 4> Used; SmallVector Ready; HWS.issueInstruction(IR, Used, Ready); + NumIssuedOpcodes += IR.getInstruction()->getDesc().NumMicroOps; notifyReservedOrReleasedBuffers(IR, /* Reserved */ false); @@ -89,6 +90,8 @@ SmallVector Ready; HWS.cycleEvent(Freed, Executed, Ready); + NumDispatchedOpcodes = 0; + NumIssuedOpcodes = 0; for (const ResourceRef &RR : Freed) notifyResourceAvailable(RR); @@ -106,6 +109,45 @@ return issueReadyInstructions(); } +Error ExecuteStage::cycleEnd() { + if (!EnablePressureEvents) + return ErrorSuccess(); + + // Always conservatively report any backpressure events if the dispatch logic + // was stalled due to unavailable scheduler resources. + if (!HWS.hadTokenStall() && NumDispatchedOpcodes <= NumIssuedOpcodes) + return ErrorSuccess(); + + SmallVector Insts; + uint64_t Mask = HWS.analyzeResourcePressure(Insts); + if (Mask) { + LLVM_DEBUG(dbgs() << "[E] Backpressure increased because of unavailable " + "pipeline resources: " + << format_hex(Mask, 16) << '\n'); + HWPressureEvent Ev(HWPressureEvent::RESOURCES, Insts, Mask); + notifyEvent(Ev); + return ErrorSuccess(); + } + + SmallVector RegDeps; + SmallVector MemDeps; + HWS.analyzeDataDependencies(RegDeps, MemDeps); + if (RegDeps.size()) { + LLVM_DEBUG( + dbgs() << "[E] Backpressure increased by register dependencies\n"); + HWPressureEvent Ev(HWPressureEvent::REGISTER_DEPS, RegDeps); + notifyEvent(Ev); + } + + if (MemDeps.size()) { + LLVM_DEBUG(dbgs() << "[E] Backpressure increased by memory dependencies\n"); + HWPressureEvent Ev(HWPressureEvent::MEMORY_DEPS, MemDeps); + notifyEvent(Ev); + } + + return ErrorSuccess(); +} + #ifndef NDEBUG static void verifyInstructionEliminated(const InstRef &IR) { const Instruction &Inst = *IR.getInstruction(); @@ -147,6 +189,7 @@ // be released after MCIS is issued, and all the ResourceCycles for those // units have been consumed. bool IsReadyInstruction = HWS.dispatch(IR); + NumDispatchedOpcodes += IR.getInstruction()->getDesc().NumMicroOps; notifyReservedOrReleasedBuffers(IR, /* Reserved */ true); if (!IsReadyInstruction) return ErrorSuccess(); Index: test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s +++ test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s @@ -0,0 +1,85 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s + +add %eax, %ebx +add %ebx, %ecx +add %ecx, %edx +add %edx, %eax + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 403 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.99 +# CHECK-NEXT: IPC: 0.99 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Cycles with backend pressure increase [ 94.04% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 0.00% ] +# CHECK-NEXT: Data Dependencies: [ 94.04% ] +# CHECK-NEXT: - Register Dependencies [ 94.04% ] +# CHECK-NEXT: - Memory Dependencies [ 0.00% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.50 addl %eax, %ebx +# CHECK-NEXT: 1 1 0.50 addl %ebx, %ecx +# CHECK-NEXT: 1 1 0.50 addl %ecx, %edx +# CHECK-NEXT: 1 1 0.50 addl %edx, %eax + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %eax, %ebx +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - addl %ebx, %ecx +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %ecx, %edx +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - addl %edx, %eax + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DeER .. addl %eax, %ebx +# CHECK-NEXT: [0,1] D=eER.. addl %ebx, %ecx +# CHECK-NEXT: [0,2] .D=eER. addl %ecx, %edx +# CHECK-NEXT: [0,3] .D==eER addl %edx, %eax + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 addl %eax, %ebx +# CHECK-NEXT: 1. 1 2.0 0.0 0.0 addl %ebx, %ecx +# CHECK-NEXT: 2. 1 2.0 0.0 0.0 addl %ecx, %edx +# CHECK-NEXT: 3. 1 3.0 0.0 0.0 addl %edx, %eax Index: test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s +++ test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s @@ -0,0 +1,72 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s + +vhaddps %xmm0, %xmm0, %xmm1 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 100 +# CHECK-NEXT: Total Cycles: 106 +# CHECK-NEXT: Total uOps: 100 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.94 +# CHECK-NEXT: IPC: 0.94 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Cycles with backend pressure increase [ 76.42% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 76.42% ] +# CHECK-NEXT: - JFPA [ 76.42% ] +# CHECK-NEXT: - JFPU0 [ 76.42% ] +# CHECK-NEXT: Data Dependencies: [ 0.00% ] +# CHECK-NEXT: - Register Dependencies [ 0.00% ] +# CHECK-NEXT: - Memory Dependencies [ 0.00% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 1.00 vhaddps %xmm0, %xmm0, %xmm1 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vhaddps %xmm0, %xmm0, %xmm1 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456 + +# CHECK: [0,0] DeeeeER vhaddps %xmm0, %xmm0, %xmm1 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vhaddps %xmm0, %xmm0, %xmm1 Index: test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s +++ test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s @@ -0,0 +1,106 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -noalias=false -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s + +vmovaps (%rsi), %xmm0 +vmovaps %xmm0, (%rdi) +vmovaps 16(%rsi), %xmm0 +vmovaps %xmm0, 16(%rdi) +vmovaps 32(%rsi), %xmm0 +vmovaps %xmm0, 32(%rdi) +vmovaps 48(%rsi), %xmm0 +vmovaps %xmm0, 48(%rdi) + +# CHECK: Iterations: 1500 +# CHECK-NEXT: Instructions: 12000 +# CHECK-NEXT: Total Cycles: 36003 +# CHECK-NEXT: Total uOps: 12000 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.33 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Cycles with backend pressure increase [ 99.89% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 0.00% ] +# CHECK-NEXT: Data Dependencies: [ 99.89% ] +# CHECK-NEXT: - Register Dependencies [ 0.00% ] +# CHECK-NEXT: - Memory Dependencies [ 99.89% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 1.00 * vmovaps (%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi) + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 2.00 2.00 4.00 4.00 4.00 - 4.00 4.00 - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - 1.00 1.00 - 1.00 - - - - - - vmovaps (%rsi), %xmm0 +# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, (%rdi) +# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: - - - - 1.00 1.00 - 1.00 - - - - - - vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 48(%rdi) + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. vmovaps (%rsi), %xmm0 +# CHECK-NEXT: [0,1] D=====eER . . . .. vmovaps %xmm0, (%rdi) +# CHECK-NEXT: [0,2] .D=====eeeeeER . . .. vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: [0,3] .D==========eER. . .. vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: [0,4] . D==========eeeeeER. .. vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: [0,5] . D===============eER .. vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: [0,6] . D===============eeeeeER. vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: [0,7] . D====================eER vmovaps %xmm0, 48(%rdi) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0 +# CHECK-NEXT: 1. 1 6.0 0.0 0.0 vmovaps %xmm0, (%rdi) +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: 3. 1 11.0 0.0 0.0 vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: 4. 1 11.0 0.0 0.0 vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: 5. 1 16.0 0.0 0.0 vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: 6. 1 16.0 0.0 0.0 vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: 7. 1 21.0 0.0 0.0 vmovaps %xmm0, 48(%rdi) Index: tools/llvm-mca/Views/SummaryView.h =================================================================== --- tools/llvm-mca/Views/SummaryView.h +++ tools/llvm-mca/Views/SummaryView.h @@ -45,6 +45,25 @@ unsigned TotalCycles; // The total number of micro opcodes contributed by a block of instructions. unsigned NumMicroOps; + + struct BackPressureInfo { + // Cycles where backpressure increased. + unsigned PressureIncreaseCycles; + // Cycles where backpressure increased because of pipeline pressure. + unsigned ResourcePressureCycles; + // Cycles where backpressure increased because of data dependencies. + unsigned DataDependencyCycles; + // Cycles where backpressure increased because of register dependencies. + unsigned RegisterDependencyCycles; + // Cycles where backpressure increased because of memory dependencies. + unsigned MemoryDependencyCycles; + }; + BackPressureInfo BPI; + + // Resource pressure distribution. There is an element for every processor + // resource declared by the scheduling model. Quantities are number of cycles. + llvm::SmallVector ResourcePressureDistribution; + // For each processor resource, this vector stores the cumulative number of // resource cycles consumed by the analyzed code block. llvm::SmallVector ProcResourceUsage; @@ -58,18 +77,43 @@ // Used to map resource indices to actual processor resource IDs. llvm::SmallVector ResIdx2ProcResID; + // True if resource pressure events were notified during this cycle. + bool PressureIncreasedBecauseOfResources; + bool PressureIncreasedBecauseOfDataDependencies; + + // True if throughput was affected by dispatch stalls. + bool SeenStallCycles; + // Compute the reciprocal throughput for the analyzed code block. // The reciprocal block throughput is computed as the MAX between: // - NumMicroOps / DispatchWidth // - Total Resource Cycles / #Units (for every resource consumed). double getBlockRThroughput() const; + // Prints a bottleneck message to OS. + void printBottleneckHints(llvm::raw_ostream &OS) const; + public: SummaryView(const llvm::MCSchedModel &Model, llvm::ArrayRef S, unsigned Width); - void onCycleEnd() override { ++TotalCycles; } + void onCycleEnd() override { + ++TotalCycles; + if (PressureIncreasedBecauseOfResources || + PressureIncreasedBecauseOfDataDependencies) { + ++BPI.PressureIncreaseCycles; + if (PressureIncreasedBecauseOfDataDependencies) + ++BPI.DataDependencyCycles; + PressureIncreasedBecauseOfResources = false; + PressureIncreasedBecauseOfDataDependencies = false; + } + } void onEvent(const HWInstructionEvent &Event) override; + void onEvent(const HWStallEvent &Event) override { + SeenStallCycles = true; + } + + void onEvent(const HWPressureEvent &Event) override; void printView(llvm::raw_ostream &OS) const override; }; Index: tools/llvm-mca/Views/SummaryView.cpp =================================================================== --- tools/llvm-mca/Views/SummaryView.cpp +++ tools/llvm-mca/Views/SummaryView.cpp @@ -25,10 +25,14 @@ SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef S, unsigned Width) : SM(Model), Source(S), DispatchWidth(Width), LastInstructionIdx(0), - TotalCycles(0), NumMicroOps(0), + TotalCycles(0), NumMicroOps(0), BPI({0, 0, 0, 0}), + ResourcePressureDistribution(Model.getNumProcResourceKinds(), 0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0), ProcResourceMasks(Model.getNumProcResourceKinds()), - ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0) { + ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0), + PressureIncreasedBecauseOfResources(false), + PressureIncreasedBecauseOfDataDependencies(false), + SeenStallCycles(false) { computeProcResourceMasks(SM, ProcResourceMasks); for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { unsigned Index = getResourceStateIndex(ProcResourceMasks[I]); @@ -61,6 +65,96 @@ } } +void SummaryView::onEvent(const HWPressureEvent &Event) { + switch (Event.Reason) { + case HWPressureEvent::INVALID: + default: + break; + + case HWPressureEvent::RESOURCES: { + PressureIncreasedBecauseOfResources = true; + ++BPI.ResourcePressureCycles; + uint64_t ResourceMask = Event.ResourceMask; + while (ResourceMask) { + uint64_t Current = ResourceMask & (-ResourceMask); + unsigned Index = getResourceStateIndex(Current); + unsigned ProcResID = ResIdx2ProcResID[Index]; + const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID); + if (!PRDesc.SubUnitsIdxBegin) { + ResourcePressureDistribution[Index]++; + ResourceMask ^= Current; + continue; + } + + for (unsigned I = 0, E = PRDesc.NumUnits; I < E; ++I) { + unsigned OtherProcResID = PRDesc.SubUnitsIdxBegin[I]; + unsigned OtherMask = ProcResourceMasks[OtherProcResID]; + ResourcePressureDistribution[getResourceStateIndex(OtherMask)]++; + } + + ResourceMask ^= Current; + } + } + + break; + case HWPressureEvent::REGISTER_DEPS: + PressureIncreasedBecauseOfDataDependencies = true; + ++BPI.RegisterDependencyCycles; + break; + case HWPressureEvent::MEMORY_DEPS : + PressureIncreasedBecauseOfDataDependencies = true; + ++BPI.MemoryDependencyCycles; + break; + } +} + +void SummaryView::printBottleneckHints(raw_ostream &OS) const { + if (!SeenStallCycles || !BPI.PressureIncreaseCycles) + return; + + double PressurePerCycle = + (double)BPI.PressureIncreaseCycles * 100 / TotalCycles; + double ResourcePressurePerCycle = + (double)BPI.ResourcePressureCycles * 100 / TotalCycles; + double DDPerCycle = (double)BPI.DataDependencyCycles * 100 / TotalCycles; + double RegDepPressurePerCycle = + (double)BPI.RegisterDependencyCycles * 100 / TotalCycles; + double MemDepPressurePerCycle = + (double)BPI.MemoryDependencyCycles * 100 / TotalCycles; + + OS << "\nCycles with backend pressure increase [ " + << format("%.2f", floor((PressurePerCycle * 100) + 0.5) / 100) << "% ]"; + + OS << "\nThroughput Bottlenecks: " + << "\n Resource Pressure [ " + << format("%.2f", floor((ResourcePressurePerCycle * 100) + 0.5) / 100) + << "% ]"; + + if (BPI.PressureIncreaseCycles) { + for (unsigned I = 0, E = ResourcePressureDistribution.size(); I < E; ++I) { + if (ResourcePressureDistribution[I]) { + double Frequency = + (double)ResourcePressureDistribution[I] * 100 / TotalCycles; + unsigned Index = ResIdx2ProcResID[getResourceStateIndex(1ULL << I)]; + const MCProcResourceDesc &PRDesc = *SM.getProcResource(Index); + OS << "\n - " << PRDesc.Name << " [ " + << format("%.2f", floor((Frequency * 100) + 0.5) / 100) << "% ]"; + } + } + } + + OS << "\n Data Dependencies: [ " + << format("%.2f", floor((DDPerCycle * 100) + 0.5) / 100) << "% ]"; + + OS << "\n - Register Dependencies [ " + << format("%.2f", floor((RegDepPressurePerCycle * 100) + 0.5) / 100) + << "% ]"; + + OS << "\n - Memory Dependencies [ " + << format("%.2f", floor((MemDepPressurePerCycle * 100) + 0.5) / 100) + << "% ]\n\n"; +} + void SummaryView::printView(raw_ostream &OS) const { unsigned Instructions = Source.size(); unsigned Iterations = (LastInstructionIdx / Instructions) + 1; @@ -85,6 +179,8 @@ TempStream << "\nBlock RThroughput: " << format("%.1f", floor((BlockRThroughput * 10) + 0.5) / 10) << '\n'; + + printBottleneckHints(TempStream); TempStream.flush(); OS << Buffer; } Index: tools/llvm-mca/llvm-mca.cpp =================================================================== --- tools/llvm-mca/llvm-mca.cpp +++ tools/llvm-mca/llvm-mca.cpp @@ -175,6 +175,11 @@ cl::desc("Print all views including hardware statistics"), cl::cat(ViewOptions), cl::init(false)); +static cl::opt EnableBottleneckAnalysis( + "bottleneck-analysis", + cl::desc("Enable bottleneck analysis (disabled by the fault)"), + cl::cat(ViewOptions), cl::init(false)); + namespace { const Target *getTarget(const char *ProgName) { @@ -387,7 +392,8 @@ mca::Context MCA(*MRI, *STI); mca::PipelineOptions PO(Width, RegisterFileSize, LoadQueueSize, - StoreQueueSize, AssumeNoAlias); + StoreQueueSize, AssumeNoAlias, + EnableBottleneckAnalysis); // Number each region in the sequence. unsigned RegionIdx = 0;