diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h --- a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h +++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h @@ -21,6 +21,7 @@ namespace llvm { namespace mca { +class LSUnit; class RegisterFile; struct StallInfo { @@ -29,6 +30,7 @@ REGISTER_DEPS, DISPATCH, DELAY, + LOAD_STORE, CUSTOM_STALL }; @@ -54,6 +56,7 @@ RegisterFile &PRF; ResourceManager RM; CustomBehaviour &CB; + LSUnit &LSU; /// Instructions that were issued, but not executed yet. SmallVector IssuedInst; @@ -110,7 +113,7 @@ public: InOrderIssueStage(const MCSubtargetInfo &STI, RegisterFile &PRF, - CustomBehaviour &CB); + CustomBehaviour &CB, LSUnit &LSU); unsigned getIssueWidth() const; bool isAvailable(const InstRef &) const override; diff --git a/llvm/lib/MCA/Context.cpp b/llvm/lib/MCA/Context.cpp --- a/llvm/lib/MCA/Context.cpp +++ b/llvm/lib/MCA/Context.cpp @@ -74,14 +74,17 @@ CustomBehaviour &CB) { const MCSchedModel &SM = STI.getSchedModel(); auto PRF = std::make_unique(SM, MRI, Opts.RegisterFileSize); + auto LSU = std::make_unique(SM, Opts.LoadQueueSize, + Opts.StoreQueueSize, Opts.AssumeNoAlias); // Create the pipeline stages. auto Entry = std::make_unique(SrcMgr); - auto InOrderIssue = std::make_unique(STI, *PRF, CB); + auto InOrderIssue = std::make_unique(STI, *PRF, CB, *LSU); auto StagePipeline = std::make_unique(); // Pass the ownership of all the hardware units to this Context. addHardwareUnit(std::move(PRF)); + addHardwareUnit(std::move(LSU)); // Build the pipeline. 
StagePipeline->appendStage(std::move(Entry)); diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp --- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp +++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MCA/Stages/InOrderIssueStage.h" +#include "llvm/MCA/HardwareUnits/LSUnit.h" #include "llvm/MCA/HardwareUnits/RegisterFile.h" #include "llvm/MCA/HardwareUnits/RetireControlUnit.h" #include "llvm/MCA/Instruction.h" @@ -43,9 +44,10 @@ } InOrderIssueStage::InOrderIssueStage(const MCSubtargetInfo &STI, - RegisterFile &PRF, CustomBehaviour &CB) - : STI(STI), PRF(PRF), RM(STI.getSchedModel()), CB(CB), NumIssued(), SI(), - CarryOver(), Bandwidth(), LastWriteBackCycle() {} + RegisterFile &PRF, CustomBehaviour &CB, + LSUnit &LSU) + : STI(STI), PRF(PRF), RM(STI.getSchedModel()), CB(CB), LSU(LSU), + NumIssued(), SI(), CarryOver(), Bandwidth(), LastWriteBackCycle() {} unsigned InOrderIssueStage::getIssueWidth() const { return STI.getSchedModel().IssueWidth; @@ -125,6 +127,13 @@ return false; } + if (IR.getInstruction()->isMemOp() && !LSU.isReady(IR)) { + // This load (store) aliases with a preceding store (load). Delay + // it until the dependency is cleared. + SI.update(IR, /* delay */ 1, StallInfo::StallKind::LOAD_STORE); + return false; + } + if (unsigned CustomStallCycles = CB.checkCustomHazard(IssuedInst, IR)) { SI.update(IR, CustomStallCycles, StallInfo::StallKind::CUSTOM_STALL); return false; @@ -188,6 +197,10 @@ } llvm::Error InOrderIssueStage::execute(InstRef &IR) { + Instruction &IS = *IR.getInstruction(); + if (IS.isMemOp()) + IS.setLSUTokenID(LSU.dispatch(IR)); + if (llvm::Error E = tryIssue(IR)) return E; @@ -222,6 +235,9 @@ RM.issueInstruction(Desc, UsedResources); IS.execute(SourceIndex); + if (IS.isMemOp()) + LSU.onInstructionIssued(IR); + // Replace resource masks with valid resource processor IDs. 
for (ResourceUse &Use : UsedResources) { uint64_t Mask = Use.first.first; @@ -279,6 +295,7 @@ } PRF.onInstructionExecuted(&IS); + LSU.onInstructionExecuted(IR); notifyInstructionExecuted(IR); ++NumExecuted; @@ -324,6 +341,9 @@ for (const WriteState &WS : IS.getDefs()) PRF.removeRegisterWrite(WS, FreedRegs); + if (IS.isMemOp()) + LSU.onInstructionRetired(IR); + notifyInstructionRetired(IR, FreedRegs); } @@ -363,6 +383,7 @@ Bandwidth = getIssueWidth(); PRF.cycleStart(); + LSU.cycleEvent(); // Release consumed resources. SmallVector Freed; diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-alias.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-alias.s --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-alias.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-alias.s @@ -1,20 +1,24 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 -timeline --iterations=5 -noalias=false < %s | FileCheck %s +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 -timeline --iterations=3 -noalias=false < %s | FileCheck %s # PR50483: Execution of loads and stores should not overlap if flag -noalias is set to false. 
-str x1, [x4] -ldr x2, [x4] +str x1, [x10] +str x1, [x10] +ldr x2, [x10] +nop +ldr x2, [x10] +ldr x3, [x10] -# CHECK: Iterations: 5 -# CHECK-NEXT: Instructions: 10 -# CHECK-NEXT: Total Cycles: 8 -# CHECK-NEXT: Total uOps: 10 +# CHECK: Iterations: 3 +# CHECK-NEXT: Instructions: 18 +# CHECK-NEXT: Total Cycles: 31 +# CHECK-NEXT: Total uOps: 18 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 1.25 -# CHECK-NEXT: IPC: 1.25 -# CHECK-NEXT: Block RThroughput: 1.0 +# CHECK-NEXT: uOps Per Cycle: 0.58 +# CHECK-NEXT: IPC: 0.58 +# CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps @@ -25,8 +29,12 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 1 1.00 * str x1, [x4] -# CHECK-NEXT: 1 3 1.00 * ldr x2, [x4] +# CHECK-NEXT: 1 1 1.00 * str x1, [x10] +# CHECK-NEXT: 1 1 1.00 * str x1, [x10] +# CHECK-NEXT: 1 3 1.00 * ldr x2, [x10] +# CHECK-NEXT: 1 1 1.00 * * U nop +# CHECK-NEXT: 1 3 1.00 * ldr x2, [x10] +# CHECK-NEXT: 1 3 1.00 * ldr x3, [x10] # CHECK: Resources: # CHECK-NEXT: [0.0] - CortexA55UnitALU @@ -44,26 +52,39 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] -# CHECK-NEXT: - - - - - - - - - 1.00 - 1.00 +# CHECK-NEXT: - - 1.00 - - - - - - 3.00 - 2.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: -# CHECK-NEXT: - - - - - - - - - - - 1.00 str x1, [x4] -# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x2, [x4] +# CHECK-NEXT: - - - - - - - - - - - 1.00 str x1, [x10] +# CHECK-NEXT: - - - - - - - - - - - 1.00 str x1, [x10] +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x2, [x10] +# CHECK-NEXT: - - 1.00 - - - - - - - - - nop +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x2, [x10] +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x3, [x10] # CHECK: Timeline view: -# CHECK-NEXT: Index 01234567 +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 
0123456789 0123456789 -# CHECK: [0,0] DE . . str x1, [x4] -# CHECK-NEXT: [0,1] DeeE . . ldr x2, [x4] -# CHECK-NEXT: [1,0] .DE . . str x1, [x4] -# CHECK-NEXT: [1,1] .DeeE. . ldr x2, [x4] -# CHECK-NEXT: [2,0] . DE . . str x1, [x4] -# CHECK-NEXT: [2,1] . DeeE . ldr x2, [x4] -# CHECK-NEXT: [3,0] . DE. . str x1, [x4] -# CHECK-NEXT: [3,1] . DeeE. ldr x2, [x4] -# CHECK-NEXT: [4,0] . DE . str x1, [x4] -# CHECK-NEXT: [4,1] . DeeE ldr x2, [x4] +# CHECK: [0,0] DE . . . . . . str x1, [x10] +# CHECK-NEXT: [0,1] .DE . . . . . . str x1, [x10] +# CHECK-NEXT: [0,2] . DeeE . . . . . ldr x2, [x10] +# CHECK-NEXT: [0,3] . DE . . . . . nop +# CHECK-NEXT: [0,4] . .DeeE. . . . . ldr x2, [x10] +# CHECK-NEXT: [0,5] . . DeeE . . . . ldr x3, [x10] +# CHECK-NEXT: [1,0] . . DE . . . . str x1, [x10] +# CHECK-NEXT: [1,1] . . .DE . . . . str x1, [x10] +# CHECK-NEXT: [1,2] . . . DeeE . . . ldr x2, [x10] +# CHECK-NEXT: [1,3] . . . DE . . . nop +# CHECK-NEXT: [1,4] . . . .DeeE. . . ldr x2, [x10] +# CHECK-NEXT: [1,5] . . . . DeeE . . ldr x3, [x10] +# CHECK-NEXT: [2,0] . . . . DE . . str x1, [x10] +# CHECK-NEXT: [2,1] . . . . .DE . . str x1, [x10] +# CHECK-NEXT: [2,2] . . . . . DeeE . ldr x2, [x10] +# CHECK-NEXT: [2,3] . . . . . DE . nop +# CHECK-NEXT: [2,4] . . . . . .DeeE. ldr x2, [x10] +# CHECK-NEXT: [2,5] . . . . . . DeeE ldr x3, [x10] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -72,6 +93,10 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 5 0.0 0.0 0.0 str x1, [x4] -# CHECK-NEXT: 1. 5 0.0 0.0 0.0 ldr x2, [x4] -# CHECK-NEXT: 5 0.0 0.0 0.0 +# CHECK-NEXT: 0. 3 0.0 0.0 0.0 str x1, [x10] +# CHECK-NEXT: 1. 3 0.0 0.0 0.0 str x1, [x10] +# CHECK-NEXT: 2. 3 0.0 0.0 0.0 ldr x2, [x10] +# CHECK-NEXT: 3. 3 0.0 0.0 0.0 nop +# CHECK-NEXT: 4. 3 0.0 0.0 0.0 ldr x2, [x10] +# CHECK-NEXT: 5. 
3 0.0 0.0 0.0 ldr x3, [x10] +# CHECK-NEXT: 3 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-load-store-noalias.s @@ -0,0 +1,100 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 -timeline --iterations=3 --noalias=true < %s | FileCheck %s + +str x1, [x10] +str x1, [x10] +ldr x2, [x10] +nop +ldr x2, [x10] +ldr x3, [x10] + +# CHECK: Iterations: 3 +# CHECK-NEXT: Instructions: 18 +# CHECK-NEXT: Total Cycles: 19 +# CHECK-NEXT: Total uOps: 18 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.95 +# CHECK-NEXT: IPC: 0.95 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 1.00 * str x1, [x10] +# CHECK-NEXT: 1 1 1.00 * str x1, [x10] +# CHECK-NEXT: 1 3 1.00 * ldr x2, [x10] +# CHECK-NEXT: 1 1 1.00 * * U nop +# CHECK-NEXT: 1 3 1.00 * ldr x2, [x10] +# CHECK-NEXT: 1 3 1.00 * ldr x3, [x10] + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - CortexA55UnitALU +# CHECK-NEXT: [0.1] - CortexA55UnitALU +# CHECK-NEXT: [1] - CortexA55UnitB +# CHECK-NEXT: [2] - CortexA55UnitDiv +# CHECK-NEXT: [3.0] - CortexA55UnitFPALU +# CHECK-NEXT: [3.1] - CortexA55UnitFPALU +# CHECK-NEXT: [4] - CortexA55UnitFPDIV +# CHECK-NEXT: [5.0] - CortexA55UnitFPMAC +# CHECK-NEXT: [5.1] - CortexA55UnitFPMAC +# CHECK-NEXT: [6] - CortexA55UnitLd +# CHECK-NEXT: [7] - CortexA55UnitMAC +# CHECK-NEXT: [8] - CortexA55UnitSt + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] +# CHECK-NEXT: - - 1.00 - - - - - - 
3.00 - 2.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: +# CHECK-NEXT: - - - - - - - - - - - 1.00 str x1, [x10] +# CHECK-NEXT: - - - - - - - - - - - 1.00 str x1, [x10] +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x2, [x10] +# CHECK-NEXT: - - 1.00 - - - - - - - - - nop +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x2, [x10] +# CHECK-NEXT: - - - - - - - - - 1.00 - - ldr x3, [x10] + +# CHECK: Timeline view: +# CHECK-NEXT: 012345678 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DE . . . . str x1, [x10] +# CHECK-NEXT: [0,1] .DE . . . . str x1, [x10] +# CHECK-NEXT: [0,2] .DeeE. . . . ldr x2, [x10] +# CHECK-NEXT: [0,3] . DE. . . . nop +# CHECK-NEXT: [0,4] . DeeE . . . ldr x2, [x10] +# CHECK-NEXT: [0,5] . DeeE . . . ldr x3, [x10] +# CHECK-NEXT: [1,0] . DE . . . str x1, [x10] +# CHECK-NEXT: [1,1] . .DE . . . str x1, [x10] +# CHECK-NEXT: [1,2] . .DeeE. . . ldr x2, [x10] +# CHECK-NEXT: [1,3] . . DE. . . nop +# CHECK-NEXT: [1,4] . . DeeE . . ldr x2, [x10] +# CHECK-NEXT: [1,5] . . DeeE . . ldr x3, [x10] +# CHECK-NEXT: [2,0] . . DE . . str x1, [x10] +# CHECK-NEXT: [2,1] . . .DE . . str x1, [x10] +# CHECK-NEXT: [2,2] . . .DeeE. . ldr x2, [x10] +# CHECK-NEXT: [2,3] . . . DE. . nop +# CHECK-NEXT: [2,4] . . . DeeE. ldr x2, [x10] +# CHECK-NEXT: [2,5] . . . DeeE ldr x3, [x10] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 0.0 0.0 0.0 str x1, [x10] +# CHECK-NEXT: 1. 3 0.0 0.0 0.0 str x1, [x10] +# CHECK-NEXT: 2. 3 0.0 0.0 0.0 ldr x2, [x10] +# CHECK-NEXT: 3. 3 0.0 0.0 0.0 nop +# CHECK-NEXT: 4. 3 0.0 0.0 0.0 ldr x2, [x10] +# CHECK-NEXT: 5. 
3 0.0 0.0 0.0 ldr x3, [x10] +# CHECK-NEXT: 3 0.0 0.0 0.0