diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h --- a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h +++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h @@ -27,11 +27,13 @@ namespace mca { class RegisterFile; class ResourceManager; +class LSUnit; class InOrderIssueStage final : public Stage { const MCSchedModel &SM; const MCSubtargetInfo &STI; RegisterFile &PRF; + LSUnit &LSU; std::unique_ptr RM; /// Instructions that were issued, but not executed yet. @@ -79,11 +81,11 @@ void retireInstruction(InstRef &IR); public: - InOrderIssueStage(RegisterFile &PRF, const MCSchedModel &SM, + InOrderIssueStage(RegisterFile &PRF, LSUnit &LSU, const MCSchedModel &SM, const MCSubtargetInfo &STI) - : SM(SM), STI(STI), PRF(PRF), RM(std::make_unique(SM)), - NumIssued(0), StallCyclesLeft(0), CarryOver(0), Bandwidth(0), - LastWriteBackCycle(0) {} + : SM(SM), STI(STI), PRF(PRF), LSU(LSU), + RM(std::make_unique(SM)), NumIssued(0), + StallCyclesLeft(0), CarryOver(0), Bandwidth(0), LastWriteBackCycle(0) {} bool isAvailable(const InstRef &) const override; bool hasWorkToComplete() const override; diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp --- a/llvm/lib/MCA/Context.cpp +++ b/llvm/lib/MCA/Context.cpp @@ -72,15 +72,18 @@ Context::createInOrderPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) { const MCSchedModel &SM = STI.getSchedModel(); auto PRF = std::make_unique(SM, MRI, Opts.RegisterFileSize); + auto LSU = std::make_unique(SM, Opts.LoadQueueSize, + Opts.StoreQueueSize, Opts.AssumeNoAlias); auto Entry = std::make_unique(SrcMgr); - auto InOrderIssue = std::make_unique(*PRF, SM, STI); + auto InOrderIssue = std::make_unique(*PRF, *LSU, SM, STI); auto StagePipeline = std::make_unique(); StagePipeline->appendStage(std::move(Entry)); StagePipeline->appendStage(std::move(InOrderIssue)); addHardwareUnit(std::move(PRF)); + addHardwareUnit(std::move(LSU)); return StagePipeline; } diff --git 
a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp --- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp +++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp @@ -15,6 +15,7 @@ #include "llvm/MC/MCSchedule.h" #include "llvm/MCA/HWEventListener.h" +#include "llvm/MCA/HardwareUnits/LSUnit.h" #include "llvm/MCA/HardwareUnits/RegisterFile.h" #include "llvm/MCA/HardwareUnits/ResourceManager.h" #include "llvm/MCA/HardwareUnits/RetireControlUnit.h" @@ -49,6 +50,9 @@ if (Desc.BeginGroup && NumIssued != 0) return false; + if (LSU.isAvailable(IR) != LSUnit::LSU_AVAILABLE) + return false; + return true; } @@ -160,6 +164,10 @@ HWStallEvent(HWStallEvent::DispatchGroupStall, IR)); notifyEvent( HWPressureEvent(HWPressureEvent::RESOURCES, IR)); + } else if (IR.getInstruction()->isMemOp() && !LSU.isReady(IR)) { + // This load (store) aliases with a preceding store (load). Delay + // it until the dependency is cleared. + *StallCycles = 1; } else if (LastWriteBackCycle) { if (!IR.getInstruction()->getDesc().RetireOOO) { unsigned NextWriteBackCycle = findFirstWriteBackCycle(IR); @@ -209,6 +217,10 @@ } llvm::Error InOrderIssueStage::execute(InstRef &IR) { + Instruction &IS = *IR.getInstruction(); + if (IS.isMemOp()) + IS.setLSUTokenID(LSU.dispatch(IR)); + if (llvm::Error E = tryIssue(IR, &StallCyclesLeft)) return E; @@ -244,6 +256,9 @@ RM->issueInstruction(Desc, UsedResources); IS.execute(SourceIndex); + if (IS.isMemOp()) + LSU.onInstructionIssued(IR); + // Replace resource masks with valid resource processor IDs. 
for (std::pair &Use : UsedResources) { uint64_t Mask = Use.first.first; @@ -289,6 +304,7 @@ } PRF.onInstructionExecuted(&IS); + LSU.onInstructionExecuted(IR); notifyEvent( HWInstructionEvent(HWInstructionEvent::Executed, IR)); LLVM_DEBUG(dbgs() << "[E] Instruction #" << IR << " is executed\n"); @@ -337,6 +353,9 @@ for (const WriteState &WS : IS.getDefs()) PRF.removeRegisterWrite(WS, FreedRegs); + if (IS.isMemOp()) + LSU.onInstructionRetired(IR); + notifyEvent(HWInstructionRetiredEvent(IR, FreedRegs)); LLVM_DEBUG(dbgs() << "[E] Retired #" << IR << " \n"); } @@ -346,6 +365,7 @@ Bandwidth = SM.IssueWidth; PRF.cycleStart(); + LSU.cycleEvent(); // Release consumed resources. SmallVector Freed; diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-alias.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-alias.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-alias.s @@ -0,0 +1,119 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --all-views --iterations=2 --noalias=false < %s | FileCheck %s + +str x1, [x10] +str x1, [x10] +ldr x2, [x10] +nop +ldr x2, [x10] +ldr x3, [x10] + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 33 +# CHECK-NEXT: Total uOps: 12 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 1.00 * str x1, [x10] +# CHECK-NEXT: 1 4 1.00 * str x1, [x10] +# CHECK-NEXT: 1 3 1.00 * ldr x2, [x10] +# CHECK-NEXT: 1 1 1.00 * * U nop +# CHECK-NEXT: 1 3 1.00 * ldr x2, [x10] +# CHECK-NEXT: 1 3 1.00 * ldr x3, [x10] + +# 
CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 0 +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 4 (12.1%) + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 21 (63.6%) +# CHECK-NEXT: 1, 12 (36.4%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: [# issued], [# cycles] +# CHECK-NEXT: 0, 21 (63.6%) +# CHECK-NEXT: 1, 12 (36.4%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 6 +# CHECK-NEXT: Max number of mappings used: 2 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: - - 1.00 - - - - - - 3.00 - 2.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - - - - - - 1.00 str x1, [x10] +# CHECK-NEXT: - - - - - - - - - - - 1.00 str x1, [x10] +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x2, [x10] +# CHECK-NEXT: - - 1.00 - - - - - - - - - nop +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x2, [x10] +# CHECK-NEXT: - - - 
- - - - - - 1.00 - - ldr x3, [x10] + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeE. . . . . . . str x1, [x10] +# CHECK-NEXT: [0,1] . DeeeE . . . . . . str x1, [x10] +# CHECK-NEXT: [0,2] . . DeeE . . . . . ldr x2, [x10] +# CHECK-NEXT: [0,3] . . .DE . . . . . nop +# CHECK-NEXT: [0,4] . . . DeeE . . . . ldr x2, [x10] +# CHECK-NEXT: [0,5] . . . DeeE . . . . ldr x3, [x10] +# CHECK-NEXT: [1,0] . . . .DeeeE . . . str x1, [x10] +# CHECK-NEXT: [1,1] . . . . DeeeE. . . str x1, [x10] +# CHECK-NEXT: [1,2] . . . . . DeeE . . ldr x2, [x10] +# CHECK-NEXT: [1,3] . . . . . . DE . . nop +# CHECK-NEXT: [1,4] . . . . . . DeeE. ldr x2, [x10] +# CHECK-NEXT: [1,5] . . . . . . DeeE ldr x3, [x10] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 str x1, [x10] +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 str x1, [x10] +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 ldr x2, [x10] +# CHECK-NEXT: 3. 2 0.0 0.0 0.0 nop +# CHECK-NEXT: 4. 2 0.0 0.0 0.0 ldr x2, [x10] +# CHECK-NEXT: 5. 
2 0.0 0.0 0.0 ldr x3, [x10] +# CHECK-NEXT: 2 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s @@ -0,0 +1,121 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-stats --all-views --iterations=2 --noalias=true < %s | FileCheck %s + +str x1, [x10] +str x1, [x10] +ldr x2, [x10] +nop +ldr x2, [x10] +ldr x3, [x10] + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 12 +# CHECK-NEXT: Total uOps: 12 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.00 +# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 4 1.00 * str x1, [x10] +# CHECK-NEXT: 1 4 1.00 * str x1, [x10] +# CHECK-NEXT: 1 3 1.00 * ldr x2, [x10] +# CHECK-NEXT: 1 1 1.00 * * U nop +# CHECK-NEXT: 1 3 1.00 * ldr x2, [x10] +# CHECK-NEXT: 1 3 1.00 * ldr x3, [x10] + +# CHECK: Dynamic Dispatch Stall Cycles: +# CHECK-NEXT: RAT - Register unavailable: 0 +# CHECK-NEXT: RCU - Retire tokens unavailable: 0 +# CHECK-NEXT: SCHEDQ - Scheduler full: 0 +# CHECK-NEXT: LQ - Load queue full: 0 +# CHECK-NEXT: SQ - Store queue full: 0 +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 3 (25.0%) + +# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: +# CHECK-NEXT: [# dispatched], [# cycles] +# CHECK-NEXT: 0, 3 (25.0%) +# CHECK-NEXT: 1, 6 (50.0%) +# CHECK-NEXT: 2, 3 (25.0%) + +# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: +# CHECK-NEXT: 
[# issued], [# cycles] +# CHECK-NEXT: 0, 3 (25.0%) +# CHECK-NEXT: 1, 6 (50.0%) +# CHECK-NEXT: 2, 3 (25.0%) + +# CHECK: Scheduler's queue usage: +# CHECK-NEXT: No scheduler resources used. + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 6 +# CHECK-NEXT: Max number of mappings used: 3 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: - - 1.00 - - - - - - 3.00 - 2.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - - - - - - 1.00 str x1, [x10] +# CHECK-NEXT: - - - - - - - - - - - 1.00 str x1, [x10] +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x2, [x10] +# CHECK-NEXT: - - 1.00 - - - - - - - - - nop +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x2, [x10] +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x3, [x10] + +# CHECK: Timeline view: +# CHECK-NEXT: 01 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeE. .. str x1, [x10] +# CHECK-NEXT: [0,1] .DeeeE .. str x1, [x10] +# CHECK-NEXT: [0,2] .DeeE. .. ldr x2, [x10] +# CHECK-NEXT: [0,3] . DE . .. nop +# CHECK-NEXT: [0,4] . DeeE .. ldr x2, [x10] +# CHECK-NEXT: [0,5] . DeeE .. ldr x3, [x10] +# CHECK-NEXT: [1,0] . DeeeE .. str x1, [x10] +# CHECK-NEXT: [1,1] . DeeeE.. str x1, [x10] +# CHECK-NEXT: [1,2] . DeeE .. ldr x2, [x10] +# CHECK-NEXT: [1,3] . .DE .. nop +# CHECK-NEXT: [1,4] . . DeeE. 
ldr x2, [x10] +# CHECK-NEXT: [1,5] . . DeeE ldr x3, [x10] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 0.0 0.0 0.0 str x1, [x10] +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 str x1, [x10] +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 ldr x2, [x10] +# CHECK-NEXT: 3. 2 0.0 0.0 0.0 nop +# CHECK-NEXT: 4. 2 0.0 0.0 0.0 ldr x2, [x10] +# CHECK-NEXT: 5. 2 0.0 0.0 0.0 ldr x3, [x10] +# CHECK-NEXT: 2 0.0 0.0 0.0