diff --git a/llvm/lib/Target/ARM/ARMScheduleA57.td b/llvm/lib/Target/ARM/ARMScheduleA57.td --- a/llvm/lib/Target/ARM/ARMScheduleA57.td +++ b/llvm/lib/Target/ARM/ARMScheduleA57.td @@ -183,11 +183,6 @@ // TODO: according to the doc, conditional uses I0/I1, unconditional uses M // Why more complex instruction uses more simple pipeline? // May be an error in doc. -def A57WriteALUsi : SchedWriteVariant<[ - // lsl #2, lsl #1, or lsr #1. - SchedVar>]>, - SchedVar>]> -]>; def A57WriteALUsr : SchedWriteVariant<[ SchedVar>]>, SchedVar>]> @@ -200,7 +195,7 @@ SchedVar, SchedVar ]>; -def : SchedAlias; +def : SchedAlias>>; def : SchedAlias; def : SchedAlias; def : SchedAlias; diff --git a/llvm/test/CodeGen/ARM/cortex-a57-misched-mla.mir b/llvm/test/CodeGen/ARM/cortex-a57-misched-mla.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/cortex-a57-misched-mla.mir @@ -0,0 +1,34 @@ +# RUN: llc -mcpu=cortex-a57 -mtriple=thumb -enable-misched -run-pass=machine-scheduler -debug-only=machine-scheduler %s 2>&1 | FileCheck %s + +# CHECK-LABEL: ********** MI Scheduling ********** +# CHECK: %[[RES:[0-9]+]]:rgpr = t2MLA +# CHECK-NEXT: # preds left +# CHECK-NEXT: # succs left +# CHECK-NEXT: # rdefs left +# CHECK-NEXT: Latency : 3 +# CHECK-NEXT: Depth +# CHECK-NEXT: Height +# CHECK-NEXT: Predecessors: +# CHECK-NEXT: SU({{.*}}): Data Latency=1 Reg= +# CHECK-NEXT: SU({{.*}}): Out Latency= +# CHECK-NEXT: SU({{.*}}): Data Latency=1 Reg= +# CHECK-NEXT: Successors: +# CHECK-NEXT: SU([[SMLA_SU:[0-9]+]]): Data Latency=1 Reg=%[[RES]] +# CHECK-NEXT: Pressure Diff +# CHECK-NEXT: Single Issue : false; +# CHECK-NEXT: SU([[SMLA_SU]]): {{.*}} = t2SMLAL %{{[0-9]+}}:rgpr, %{{[0-9]+}}:rgpr, %{{[0-9]+}}:rgpr(tied-def 0), %[[RES]]:rgpr(tied-def 1), 14, $noreg + +name: test_smlal_forwarding +tracksRegLiveness: true +body: | + bb.0: + liveins: $r1, $r3, $r4, $r5, $r6 + %1:rgpr = COPY $r1 + %3:rgpr = COPY $r3 + %4:rgpr = COPY $r4 + %5:rgpr = COPY $r5 + %6:rgpr = COPY $r6 + %3:rgpr = t2MLA %4:rgpr, %1:rgpr, %4:rgpr, 14, $noreg + %6:rgpr, %5:rgpr = t2SMLAL %5:rgpr, %6:rgpr, %4:rgpr, %3:rgpr, 14, $noreg + $r0 = COPY %6:rgpr + BX_RET 14, $noreg, implicit $r0 diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h --- a/llvm/utils/TableGen/CodeGenSchedule.h +++ b/llvm/utils/TableGen/CodeGenSchedule.h @@ -443,6 +443,7 @@ InstClassMapTy InstrClassMap; std::vector STIPredicates; + std::vector getAllProcIndices() const; public: CodeGenSchedModels(RecordKeeper& RK, const CodeGenTarget &TGT); diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp --- a/llvm/utils/TableGen/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/CodeGenSchedule.cpp @@ -1338,8 +1338,7 @@ PredTransitions(CodeGenSchedModels &sm): SchedModels(sm) {} bool substituteVariantOperand(const SmallVectorImpl &RWSeq, - bool IsRead, bool IsForAnyCPU, - unsigned StartIdx); + bool IsRead, unsigned StartIdx); bool substituteVariants(const PredTransition &Trans); @@ -1413,29 +1412,6 @@ return false; } -static bool hasAliasedVariants(const CodeGenSchedRW &RW, - CodeGenSchedModels &SchedModels) { - if (RW.HasVariants) - return true; - - for (Record *Alias : RW.Aliases) { - const CodeGenSchedRW &AliasRW = - SchedModels.getSchedRW(Alias->getValueAsDef("AliasRW")); - if (AliasRW.HasVariants) - return true; - if (AliasRW.IsSequence) { - IdxVec ExpandedRWs; - SchedModels.expandRWSequence(AliasRW.Index, ExpandedRWs, AliasRW.IsRead); - for (unsigned SI : ExpandedRWs) { - if (hasAliasedVariants(SchedModels.getSchedRW(SI, AliasRW.IsRead), - SchedModels)) - return true; - } - } - } - return false; -} - static std::vector getAllPredicates(ArrayRef Variants, ArrayRef ProcIndices) { std::vector Preds; @@ -1613,21 +1589,7 @@ // starts. RWSeq must be applied to all transitions between StartIdx and the end // of TransVec. bool PredTransitions::substituteVariantOperand( - const SmallVectorImpl &RWSeq, bool IsRead, bool IsForAnyCPU, - unsigned StartIdx) { - - auto CollectAndAddVariants = [&](unsigned TransIdx, - const CodeGenSchedRW &SchedRW) { - // Distribute this partial PredTransition across intersecting variants. - // This will push a copies of TransVec[TransIdx] on the back of TransVec. - std::vector IntersectingVariants; - getIntersectingVariants(SchedRW, TransIdx, IntersectingVariants); - // Now expand each variant on top of its copy of the transition. - for (const TransVariant &IV : IntersectingVariants) - pushVariant(IV, IsRead); - return !IntersectingVariants.empty(); - }; - + const SmallVectorImpl &RWSeq, bool IsRead, unsigned StartIdx) { bool Subst = false; // Visit each original RW within the current sequence. for (SmallVectorImpl::const_iterator @@ -1636,35 +1598,24 @@ // Push this RW on all partial PredTransitions or distribute variants. // New PredTransitions may be pushed within this loop which should not be // revisited (TransEnd must be loop invariant). - bool HasAliases = false, WasPushed = false; for (unsigned TransIdx = StartIdx, TransEnd = TransVec.size(); TransIdx != TransEnd; ++TransIdx) { - // In the common case, push RW onto the current operand's sequence. - if (!hasAliasedVariants(SchedRW, SchedModels)) { + // Distribute this partial PredTransition across intersecting variants. + // This will push a copies of TransVec[TransIdx] on the back of TransVec. + std::vector IntersectingVariants; + getIntersectingVariants(SchedRW, TransIdx, IntersectingVariants); + // Now expand each variant on top of its copy of the transition. + for (const TransVariant &IV : IntersectingVariants) + pushVariant(IV, IsRead); + if (IntersectingVariants.empty()) { if (IsRead) TransVec[TransIdx].ReadSequences.back().push_back(*RWI); else TransVec[TransIdx].WriteSequences.back().push_back(*RWI); continue; + } else { + Subst = true; } - HasAliases = true; - WasPushed |= CollectAndAddVariants(TransIdx, SchedRW); - Subst |= WasPushed; - } - if (IsRead && IsForAnyCPU && HasAliases && !WasPushed) { - // If we're here this means that in some sched class: - // a) We have read variant for CPU A - // b) We have write variant for CPU B - // b) We don't have write variant for CPU A - // d) We must expand all read/write variants (IsForAnyCPU is true) - // e) We couldn't expand SchedRW because TransVec doesn't have - // any transition with compatible CPU ID. - // In such case we create new empty transition with zero (AnyCPU) - // index. - TransVec.reserve(TransVec.size() + 1); - TransVec.emplace_back(TransVec[StartIdx].PredTerm); - TransVec.back().ReadSequences.emplace_back(); - Subst |= CollectAndAddVariants(TransVec.size() - 1, SchedRW); } } return Subst; @@ -1683,7 +1634,7 @@ bool Subst = false; TransVec.emplace_back(Trans.PredTerm, Trans.ProcIndices); - bool IsForAnyCPU = llvm::count(Trans.ProcIndices, 0); + assert(!llvm::count(Trans.ProcIndices, 0)); // Visit each original write sequence. for (SmallVectorImpl>::const_iterator WSI = Trans.WriteSequences.begin(), WSE = Trans.WriteSequences.end(); @@ -1693,8 +1644,7 @@ TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) { I->WriteSequences.emplace_back(); } - Subst |= - substituteVariantOperand(*WSI, /*IsRead=*/false, IsForAnyCPU, StartIdx); + Subst |= substituteVariantOperand(*WSI, /*IsRead=*/false, StartIdx); } // Visit each original read sequence. for (SmallVectorImpl>::const_iterator @@ -1705,8 +1655,7 @@ TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) { I->ReadSequences.emplace_back(); } - Subst |= - substituteVariantOperand(*RSI, /*IsRead=*/true, IsForAnyCPU, StartIdx); + Subst |= substituteVariantOperand(*RSI, /*IsRead=*/true, StartIdx); } return Subst; } @@ -1745,6 +1694,10 @@ // requires creating a new SchedClass. for (ArrayRef::iterator I = LastTransitions.begin(), E = LastTransitions.end(); I != E; ++I) { + // Variant expansion (substituteVariants) may create unconditional + // transitions. We don't need to build sched classes for them. + if (I->PredTerm.empty()) + continue; IdxVec OperWritesVariant, OperReadsVariant; addSequences(SchedModels, I->WriteSequences, OperWritesVariant, false); addSequences(SchedModels, I->ReadSequences, OperReadsVariant, true); @@ -1777,6 +1730,26 @@ } } +std::vector CodeGenSchedModels::getAllProcIndices() const { + std::vector ProcIdVec; + for (const auto &PM : ProcModelMap) + if (PM.second != 0) + ProcIdVec.push_back(PM.second); + return ProcIdVec; +} + +static std::vector +makePerProcessorTransitions(const PredTransition &Trans, + ArrayRef ProcIndices) { + std::vector PerCpuTransVec; + for (unsigned ProcId : ProcIndices) { + assert(ProcId != 0); + PerCpuTransVec.push_back(Trans); + PerCpuTransVec.back().ProcIndices.assign(1, ProcId); + } + return PerCpuTransVec; +} + // Create new SchedClasses for the given ReadWrite list. If any of the // ReadWrites refers to a SchedVariant, create a new SchedClass for each variant // of the ReadWrite list, following Aliases if necessary. @@ -1812,6 +1785,10 @@ } LLVM_DEBUG(dbgs() << '\n'); + LastTransitions = makePerProcessorTransitions( + LastTransitions[0], llvm::count(ProcIndices, 0) + ? ArrayRef(getAllProcIndices()) + : ProcIndices); // Collect all PredTransitions for individual operands. // Iterate until no variant writes remain. bool SubstitutedAny; @@ -1823,9 +1800,6 @@ LLVM_DEBUG(Transitions.dump()); LastTransitions.swap(Transitions.TransVec); } while (SubstitutedAny); - // If the first transition has no variants, nothing to do. - if (LastTransitions[0].PredTerm.empty()) - return; // WARNING: We are about to mutate the SchedClasses vector. Do not refer to // OperWrites, OperReads, or ProcIndices after calling inferFromTransitions.