Index: include/llvm/CodeGen/TargetSchedule.h =================================================================== --- include/llvm/CodeGen/TargetSchedule.h +++ include/llvm/CodeGen/TargetSchedule.h @@ -91,6 +91,13 @@ /// \brief Maximum number of micro-ops that may be scheduled per cycle. unsigned getIssueWidth() const { return SchedModel.IssueWidth; } + /// \brief Return true if new group must begin. + bool mustBeginGroup(const MachineInstr *MI, + const MCSchedClassDesc *SC = nullptr) const; + /// \brief Return true if current group must end. + bool mustEndGroup(const MachineInstr *MI, + const MCSchedClassDesc *SC = nullptr) const; + /// \brief Return the number of issue slots required for this MI. unsigned getNumMicroOps(const MachineInstr *MI, const MCSchedClassDesc *SC = nullptr) const; @@ -176,6 +183,7 @@ bool UseDefaultDefLatency = true) const; unsigned computeInstrLatency(unsigned Opcode) const; + /// \brief Output dependency latency of a pair of defs of the same register. /// /// This is typically one cycle. Index: include/llvm/Target/TargetSchedule.td =================================================================== --- include/llvm/Target/TargetSchedule.td +++ include/llvm/Target/TargetSchedule.td @@ -255,6 +255,8 @@ // Allow a processor to mark some scheduling classes as unsupported // for stronger verification. bit Unsupported = 0; + // Allow a processor to mark some scheduling classes as single-issue + bit SingleIssue = 0; SchedMachineModel SchedModel = ?; } Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -1172,6 +1172,12 @@ dbgs() << " Pressure Diff : "; getPressureDiff(&SU).dump(*TRI); } + dbgs() << " Single Issue : "; + if (SchedModel.mustBeginGroup(SU.getInstr()) && + SchedModel.mustEndGroup(SU.getInstr())) + dbgs() << "true;"; + else + dbgs() << "false;"; dbgs() << '\n'; } if (ExitSU.getInstr() != nullptr) @@ -1909,12 +1915,22 @@ && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) { return true; } + unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) { DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops=" << SchedModel->getNumMicroOps(SU->getInstr()) << '\n'); return true; } + + if (CurrMOps > 0 && + ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) || + (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) { + DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must " + << (isTop()? "begin" : "end") << " group\n"); + return true; + } + if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) { const MCSchedClassDesc *SC = DAG->getSchedClass(SU); for (TargetSchedModel::ProcResIter @@ -2210,6 +2226,18 @@ // one cycle. Since we commonly reach the max MOps here, opportunistically // bump the cycle to avoid uselessly checking everything in the readyQ. CurrMOps += IncMOps; + + // Bump the cycle count for issue group constraints. + // This must be done after NextCycle has been adjust for all other stalls. + // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set + // currCycle to X. + if ((isTop() && SchedModel->mustEndGroup(SU->getInstr())) || + (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) { + DEBUG(dbgs() << " Bump cycle to " + << (isTop() ? "end" : "begin") << " group\n"); + bumpCycle(++NextCycle); + } + while (CurrMOps >= SchedModel->getIssueWidth()) { DEBUG(dbgs() << " *** Max MOps " << CurrMOps << " at cycle " << CurrCycle << '\n'); Index: lib/CodeGen/TargetSchedule.cpp =================================================================== --- lib/CodeGen/TargetSchedule.cpp +++ lib/CodeGen/TargetSchedule.cpp @@ -84,6 +84,29 @@ } } +/// Returns true only if instruction is specified as single issue. +bool TargetSchedModel::mustBeginGroup(const MachineInstr *MI, + const MCSchedClassDesc *SC) const { + if (hasInstrSchedModel()) { + if (!SC) + SC = resolveSchedClass(MI); + if (SC->isValid()) + return SC->BeginGroup; + } + return false; +} + +bool TargetSchedModel::mustEndGroup(const MachineInstr *MI, + const MCSchedClassDesc *SC) const { + if (hasInstrSchedModel()) { + if (!SC) + SC = resolveSchedClass(MI); + if (SC->isValid()) + return SC->EndGroup; + } + return false; +} + unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI, const MCSchedClassDesc *SC) const { if (hasInstrItineraries()) { Index: lib/Target/ARM/ARMScheduleR52.td =================================================================== --- lib/Target/ARM/ARMScheduleR52.td +++ lib/Target/ARM/ARMScheduleR52.td @@ -74,7 +74,7 @@ // Div - may stall 0-9 cycles depending on input (i.e. WRI+(0-9)/2) def : WriteRes { - let Latency = 8; let ResourceCycles = [8]; // not pipelined + let Latency = 8; let ResourceCycles = [8]; // non-pipelined } // Branches - LR written in Late EX2 @@ -141,7 +141,7 @@ let Latency = 4; let NumMicroOps = 0; } def R52WriteDIV : SchedWriteRes<[R52UnitDiv]> { - let Latency = 8; let ResourceCycles = [8]; // not pipelined + let Latency = 8; let ResourceCycles = [8]; // not pipelined } def R52WriteLd : SchedWriteRes<[R52UnitLd]> { let Latency = 4; } def R52WriteST : SchedWriteRes<[R52UnitLd]> { let Latency = 4; } @@ -717,16 +717,19 @@ let Latency = 6; let NumMicroOps = 3; let ResourceCycles = [2]; + let SingleIssue = 1; } def R52WriteVLD3Mem : SchedWriteRes<[R52UnitLd]> { let Latency = 7; let NumMicroOps = 5; let ResourceCycles = [3]; + let SingleIssue = 1; } def R52WriteVLD4Mem : SchedWriteRes<[R52UnitLd]> { let Latency = 8; let NumMicroOps = 7; let ResourceCycles = [4]; + let SingleIssue = 1; } def R52WriteVST1Mem : SchedWriteRes<[R52UnitLd]> { let Latency = 5; Index: test/CodeGen/ARM/single-issue-r52.mir =================================================================== --- /dev/null +++ test/CodeGen/ARM/single-issue-r52.mir @@ -0,0 +1,86 @@ +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=misched -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=misched -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP +# REQUIRES: asserts +--- | + ; ModuleID = 'foo.ll' + source_filename = "foo.ll" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "arm---eabi" + + %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } + ; Function Attrs: nounwind + define <8 x i8> @foo(i8* %A) { + %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8* %A, i32 8) + %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0 + %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 1 + %tmp4 = add <8 x i8> %tmp2, %tmp3 + ret <8 x i8> %tmp4 + } + declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8*, i32) + +# CHECK: ********** MI Scheduling ********** +# CHECK: ScheduleDAGMILive::schedule starting +# CHECK: SU(1): %vreg1 = VLD4d8Pseudo %vreg0, 8, pred:14, pred:%noreg; mem:LD32[%A](align=8) QQPR:%vreg1 GPR:%vreg0 +# CHECK: Latency : 8 +# CHECK: Single Issue : true; +# CHECK: SU(2): %vreg4 = VADDv8i8 %vreg1:dsub_0, %vreg1:dsub_1, pred:14, pred:%noreg; DPR:%vreg4 QQPR:%vreg1 +# CHECK: Latency : 5 +# CHECK: Single Issue : false; +# CHECK: SU(3): %vreg5, %vreg6 = VMOVRRD %vreg4, pred:14, pred:%noreg; GPR:%vreg5,%vreg6 DPR:%vreg4 +# CHECK: Latency : 4 +# CHECK: Single Issue : false; + +# TOPDOWN: Scheduling SU(1) %vreg1 = VLD4d8Pseudo +# TOPDOWN: Bump cycle to end group +# TOPDOWN: Scheduling SU(2) %vreg4 = VADDv8i8 + +# BOTTOMUP: Scheduling SU(2) %vreg4 = VADDv8i8 +# BOTTOMUP: Scheduling SU(1) %vreg1 = VLD4d8Pseudo +# BOTTOMUP: Bump cycle to begin group + +... +--- +name: foo +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: qqpr } + - { id: 2, class: dpr } + - { id: 3, class: dpr } + - { id: 4, class: dpr } + - { id: 5, class: gpr } + - { id: 6, class: gpr } +liveins: + - { reg: '%r0', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %r0 + + %0 = COPY %r0 + %1 = VLD4d8Pseudo %0, 8, 14, _ :: (load 32 from %ir.A, align 8) + %4 = VADDv8i8 %1.dsub_0, %1.dsub_1, 14, _ + %5, %6 = VMOVRRD %4, 14, _ + %r0 = COPY %5 + %r1 = COPY %6 + BX_RET 14, _, implicit %r0, implicit killed %r1 + +... Index: utils/TableGen/SubtargetEmitter.cpp =================================================================== --- utils/TableGen/SubtargetEmitter.cpp +++ utils/TableGen/SubtargetEmitter.cpp @@ -917,6 +917,8 @@ SCDesc.NumMicroOps += WriteRes->getValueAsInt("NumMicroOps"); SCDesc.BeginGroup |= WriteRes->getValueAsBit("BeginGroup"); SCDesc.EndGroup |= WriteRes->getValueAsBit("EndGroup"); + SCDesc.BeginGroup |= WriteRes->getValueAsBit("SingleIssue"); + SCDesc.EndGroup |= WriteRes->getValueAsBit("SingleIssue"); // Create an entry for each ProcResource listed in WriteRes. RecVec PRVec = WriteRes->getValueAsListOfDefs("ProcResources");