diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h --- a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h +++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h @@ -45,6 +45,11 @@ InstRef StalledInst; unsigned StallCyclesLeft; + /// Instruction whose uops are issued over more than one cycle. + InstRef CarriedOver; + /// Number of uops of CarriedOver that are still left to issue. + unsigned CarryOver; + /// Number of instructions that can be issued in the current cycle. unsigned Bandwidth; @@ -67,6 +72,9 @@ /// Update status of instructions from IssuedInst. void updateIssuedInst(); + /// Continue to issue the uops of the CarriedOver instruction, if any. + void updateCarriedOver(); + /// Retire instruction once it is executed. void retireInstruction(InstRef &IR); @@ -74,7 +82,8 @@ InOrderIssueStage(RegisterFile &PRF, const MCSchedModel &SM, const MCSubtargetInfo &STI) : SM(SM), STI(STI), PRF(PRF), RM(std::make_unique(SM)), - NumIssued(0), StallCyclesLeft(0), Bandwidth(0), LastWriteBackCycle(0) {} + NumIssued(0), StallCyclesLeft(0), CarryOver(0), Bandwidth(0), + LastWriteBackCycle(0) {} bool isAvailable(const InstRef &) const override; bool hasWorkToComplete() const override; diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp --- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp +++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp @@ -29,15 +29,19 @@ namespace mca { bool InOrderIssueStage::hasWorkToComplete() const { - return !IssuedInst.empty() || StalledInst; + return !IssuedInst.empty() || StalledInst || CarriedOver; } bool InOrderIssueStage::isAvailable(const InstRef &IR) const { + if (StalledInst || CarriedOver) + return false; + const Instruction &Inst = *IR.getInstruction(); unsigned NumMicroOps = Inst.getNumMicroOps(); const InstrDesc &Desc = Inst.getDesc(); - if (Bandwidth < NumMicroOps) + bool ShouldCarryOver = NumMicroOps > SM.IssueWidth; + if (Bandwidth < NumMicroOps && !ShouldCarryOver) return false; // 
Instruction with BeginGroup must be the first instruction to be issued in a @@ -247,15 +251,20 @@ } notifyInstructionIssue(IR, UsedResources, *this); - if (Desc.EndGroup) { + bool ShouldCarryOver = NumMicroOps > Bandwidth; + if (ShouldCarryOver) { + CarryOver = NumMicroOps - Bandwidth; + CarriedOver = IR; + // Count the uops issued in this cycle before the bandwidth is cleared. + NumIssued += Bandwidth; Bandwidth = 0; + LLVM_DEBUG(dbgs() << "[N] Carry over #" << IR << " \n"); } else { - assert(Bandwidth >= NumMicroOps); - Bandwidth -= NumMicroOps; + NumIssued += NumMicroOps; + Bandwidth = Desc.EndGroup ? 0 : Bandwidth - NumMicroOps; } IssuedInst.push_back(IR); - NumIssued += NumMicroOps; if (!IR.getInstruction()->getDesc().RetireOOO) LastWriteBackCycle = findLastWriteBackCycle(IR); @@ -295,6 +304,32 @@ IssuedInst.resize(IssuedInst.size() - NumExecuted); } +void InOrderIssueStage::updateCarriedOver() { + if (!CarriedOver) + return; + + assert(!StalledInst && "A stalled instruction cannot be carried over."); + + if (CarryOver > Bandwidth) { + CarryOver -= Bandwidth; + Bandwidth = 0; + LLVM_DEBUG(dbgs() << "[N] Carry over (" << CarryOver << "uops left) #" + << CarriedOver << " \n"); + return; + } + + LLVM_DEBUG(dbgs() << "[N] Carry over (complete) #" << CarriedOver + << " \n"); + + if (CarriedOver.getInstruction()->getDesc().EndGroup) + Bandwidth = 0; + else + Bandwidth -= CarryOver; + + CarriedOver = InstRef(); + CarryOver = 0; +} + void InOrderIssueStage::retireInstruction(InstRef &IR) { Instruction &IS = *IR.getInstruction(); IS.retire(); @@ -319,6 +354,9 @@ updateIssuedInst(); + // Continue to issue the instruction carried over from the previous cycle + updateCarriedOver(); + // Issue instructions scheduled for this cycle if (!StallCyclesLeft && StalledInst) { if (llvm::Error E = tryIssue(StalledInst, &StallCyclesLeft)) diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-carry-over.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-carry-over.s new file mode 100644 --- /dev/null +++ 
b/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-carry-over.s @@ -0,0 +1,83 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a53 --timeline --iterations=1 < %s | FileCheck %s + +ldp w3, w5, [x10], #4 // 2uop + 1uop carry over +add w10, w11, w12 +add w13, w14, w15 +ldp w7, w8, [x11] // 2uop, no carry over +add w16, w17, w18 +add w19, w20, w21 + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 6 +# CHECK-NEXT: Total Cycles: 8 +# CHECK-NEXT: Total uOps: 9 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.13 +# CHECK-NEXT: IPC: 0.75 +# CHECK-NEXT: Block RThroughput: 4.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 3 4 2.00 * ldp w3, w5, [x10], #4 +# CHECK-NEXT: 1 3 0.50 add w10, w11, w12 +# CHECK-NEXT: 1 3 0.50 add w13, w14, w15 +# CHECK-NEXT: 2 4 2.00 * ldp w7, w8, [x11] +# CHECK-NEXT: 1 3 0.50 add w16, w17, w18 +# CHECK-NEXT: 1 3 0.50 add w19, w20, w21 + +# CHECK: Resources: +# CHECK-NEXT: [0.0] - A53UnitALU +# CHECK-NEXT: [0.1] - A53UnitALU +# CHECK-NEXT: [1] - A53UnitB +# CHECK-NEXT: [2] - A53UnitDiv +# CHECK-NEXT: [3] - A53UnitFPALU +# CHECK-NEXT: [4] - A53UnitFPMDS +# CHECK-NEXT: [5] - A53UnitLdSt +# CHECK-NEXT: [6] - A53UnitMAC + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] +# CHECK-NEXT: 2.00 2.00 - - - - 4.00 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: - - - - - - 2.00 - ldp w3, w5, [x10], #4 +# CHECK-NEXT: - 1.00 - - - - - - add w10, w11, w12 +# CHECK-NEXT: 1.00 - - - - - - - add w13, w14, w15 +# CHECK-NEXT: - - - - - - 2.00 - ldp w7, w8, [x11] +# CHECK-NEXT: - 1.00 - - - - - - add w16, w17, w18 +# 
CHECK-NEXT: 1.00 - - - - - - - add w19, w20, w21 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 01234567 + +# CHECK: [0,0] DeeeE. . ldp w3, w5, [x10], #4 +# CHECK-NEXT: [0,1] .DeeE. . add w10, w11, w12 +# CHECK-NEXT: [0,2] . DeeE . add w13, w14, w15 +# CHECK-NEXT: [0,3] . DeeeE ldp w7, w8, [x11] +# CHECK-NEXT: [0,4] . DeeE add w16, w17, w18 +# CHECK-NEXT: [0,5] . DeeE add w19, w20, w21 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 ldp w3, w5, [x10], #4 +# CHECK-NEXT: 1. 1 0.0 0.0 0.0 add w10, w11, w12 +# CHECK-NEXT: 2. 1 0.0 0.0 0.0 add w13, w14, w15 +# CHECK-NEXT: 3. 1 0.0 0.0 0.0 ldp w7, w8, [x11] +# CHECK-NEXT: 4. 1 0.0 0.0 0.0 add w16, w17, w18 +# CHECK-NEXT: 5. 
1 0.0 0.0 0.0 add w19, w20, w21 +# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s @@ -28,8 +28,7 @@ v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1] v_ldexp_f64 v[2:3], v[2:3], v0 -; FIXME: This instructions sends llvm-mca into an infinite loop -;v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] +v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] v_trig_preop_f64 v[2:3], v[2:3], v0 @@ -41,14 +40,14 @@ v_sqrt_f64 v[4:5], v[4:5] # CHECK: Iterations: 1 -# CHECK-NEXT: Instructions: 27 -# CHECK-NEXT: Total Cycles: 204 -# CHECK-NEXT: Total uOps: 27 +# CHECK-NEXT: Instructions: 28 +# CHECK-NEXT: Total Cycles: 224 +# CHECK-NEXT: Total uOps: 29 # CHECK: Dispatch Width: 1 # CHECK-NEXT: uOps Per Cycle: 0.13 # CHECK-NEXT: IPC: 0.13 -# CHECK-NEXT: Block RThroughput: 27.0 +# CHECK-NEXT: Block RThroughput: 29.0 # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps @@ -80,6 +79,7 @@ # CHECK-NEXT: 1 22 1.00 U v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1] # CHECK-NEXT: 1 22 1.00 U v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1] # CHECK-NEXT: 1 22 1.00 U v_ldexp_f64 v[2:3], v[2:3], v0 +# CHECK-NEXT: 2 22 2.00 U v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] # CHECK-NEXT: 1 22 1.00 U v_trig_preop_f64 v[2:3], v[2:3], v0 # CHECK-NEXT: 1 22 1.00 U v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1] # CHECK-NEXT: 1 22 1.00 U v_cmp_class_f64_e64 vcc_lo, v[2:3], s0 @@ -98,7 +98,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] -# CHECK-NEXT: - - - 27.00 - 27.00 - +# CHECK-NEXT: - - - 29.00 1.00 28.00 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions: @@ -123,6 +123,7 @@ # CHECK-NEXT: - - - 1.00 - 1.00 - v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1] # CHECK-NEXT: - - - 1.00 - 1.00 - v_div_fixup_f64 v[0:1], v[0:1], 
v[0:1], v[0:1] # CHECK-NEXT: - - - 1.00 - 1.00 - v_ldexp_f64 v[2:3], v[2:3], v0 +# CHECK-NEXT: - - - 2.00 1.00 1.00 - v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] # CHECK-NEXT: - - - 1.00 - 1.00 - v_trig_preop_f64 v[2:3], v[2:3], v0 # CHECK-NEXT: - - - 1.00 - 1.00 - v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1] # CHECK-NEXT: - - - 1.00 - 1.00 - v_cmp_class_f64_e64 vcc_lo, v[2:3], s0 @@ -176,10 +177,11 @@ # CHECK-NEXT: 18. 1 0.0 0.0 0.0 v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1] # CHECK-NEXT: 19. 1 0.0 0.0 0.0 v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1] # CHECK-NEXT: 20. 1 0.0 0.0 0.0 v_ldexp_f64 v[2:3], v[2:3], v0 -# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_trig_preop_f64 v[2:3], v[2:3], v0 -# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1] -# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_cmp_class_f64_e64 vcc_lo, v[2:3], s0 -# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_rcp_f64_e32 v[0:1], v[0:1] -# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_rsq_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_sqrt_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_trig_preop_f64 v[2:3], v[2:3], v0 +# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1] +# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_cmp_class_f64_e64 vcc_lo, v[2:3], s0 +# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_rcp_f64_e32 v[0:1], v[0:1] +# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_rsq_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: 27. 1 0.0 0.0 0.0 v_sqrt_f64_e32 v[4:5], v[4:5] # CHECK-NEXT: 1 0.0 0.0 0.0