diff --git a/llvm/lib/Target/ARM/ARMScheduleA57.td b/llvm/lib/Target/ARM/ARMScheduleA57.td --- a/llvm/lib/Target/ARM/ARMScheduleA57.td +++ b/llvm/lib/Target/ARM/ARMScheduleA57.td @@ -270,7 +270,11 @@ // from similar μops, allowing a typical sequence of multiply-accumulate μops // to issue one every 1 cycle (sched advance = 2). def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; } -def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; } +def A57WriteMLAL : SchedWriteVariant<[ + SchedVar, + SchedVar +]>; + def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>; def : InstRW<[A57WriteMLA], diff --git a/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s b/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s --- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s +++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s @@ -1421,9 +1421,9 @@ # CHECK-NEXT: 1 3 1.00 smladeq r2, r3, r5, r8 # CHECK-NEXT: 1 3 1.00 smladxhi r2, r3, r5, r8 # CHECK-NEXT: 2 4 2.00 smlal r2, r3, r5, r8 -# CHECK-NEXT: 2 4 2.00 smlals r2, r3, r5, r8 +# CHECK-NEXT: 4 5 2.00 smlals r2, r3, r5, r8 # CHECK-NEXT: 2 4 2.00 smlaleq r2, r3, r5, r8 -# CHECK-NEXT: 2 4 2.00 smlalshi r2, r3, r5, r8 +# CHECK-NEXT: 4 5 2.00 smlalshi r2, r3, r5, r8 # CHECK-NEXT: 2 4 2.00 smlalbb r3, r1, r9, r0 # CHECK-NEXT: 2 4 2.00 smlalbt r5, r6, r4, r1 # CHECK-NEXT: 2 4 2.00 smlaltb r4, r2, r3, r2 @@ -1634,12 +1634,12 @@ # CHECK-NEXT: 2 4 2.00 umaallt r3, r4, r5, r6 # CHECK-NEXT: 2 4 2.00 umlal r2, r4, r6, r8 # CHECK-NEXT: 2 4 2.00 umlalgt r6, r1, r2, r6 -# CHECK-NEXT: 2 4 2.00 umlals r2, r9, r2, r3 -# CHECK-NEXT: 2 4 2.00 umlalseq r3, r5, r1, r2 +# CHECK-NEXT: 4 5 2.00 umlals r2, r9, r2, r3 +# CHECK-NEXT: 4 5 2.00 umlalseq r3, r5, r1, r2 # CHECK-NEXT: 2 4 2.00 umull r2, r4, r6, r8 # CHECK-NEXT: 2 4 2.00 umullgt r6, r1, r2, r6 -# CHECK-NEXT: 2 4 2.00 umulls r2, r9, r2, r3 -# CHECK-NEXT: 2 4 2.00 umullseq r3, r5, r1, r2 +# CHECK-NEXT: 4 5 2.00 umulls r2, r9, 
r2, r3 +# CHECK-NEXT: 4 5 2.00 umullseq r3, r5, r1, r2 # CHECK-NEXT: 1 2 1.00 uqadd16 r1, r2, r3 # CHECK-NEXT: 1 2 1.00 uqadd16gt r4, r7, r9 # CHECK-NEXT: 1 2 1.00 uqadd8 r3, r4, r8 @@ -1719,7 +1719,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] -# CHECK-NEXT: 8.00 133.00 133.00 53.00 522.00 12.00 - - +# CHECK-NEXT: 8.00 139.00 139.00 53.00 522.00 12.00 - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] Instructions: @@ -2285,9 +2285,9 @@ # CHECK-NEXT: - - - - 1.00 - - - smladeq r2, r3, r5, r8 # CHECK-NEXT: - - - - 1.00 - - - smladxhi r2, r3, r5, r8 # CHECK-NEXT: - - - - 2.00 - - - smlal r2, r3, r5, r8 -# CHECK-NEXT: - - - - 2.00 - - - smlals r2, r3, r5, r8 +# CHECK-NEXT: - 1.00 1.00 - 2.00 - - - smlals r2, r3, r5, r8 # CHECK-NEXT: - - - - 2.00 - - - smlaleq r2, r3, r5, r8 -# CHECK-NEXT: - - - - 2.00 - - - smlalshi r2, r3, r5, r8 +# CHECK-NEXT: - 1.00 1.00 - 2.00 - - - smlalshi r2, r3, r5, r8 # CHECK-NEXT: - - - - 2.00 - - - smlalbb r3, r1, r9, r0 # CHECK-NEXT: - - - - 2.00 - - - smlalbt r5, r6, r4, r1 # CHECK-NEXT: - - - - 2.00 - - - smlaltb r4, r2, r3, r2 @@ -2498,12 +2498,12 @@ # CHECK-NEXT: - - - - 2.00 - - - umaallt r3, r4, r5, r6 # CHECK-NEXT: - - - - 2.00 - - - umlal r2, r4, r6, r8 # CHECK-NEXT: - - - - 2.00 - - - umlalgt r6, r1, r2, r6 -# CHECK-NEXT: - - - - 2.00 - - - umlals r2, r9, r2, r3 -# CHECK-NEXT: - - - - 2.00 - - - umlalseq r3, r5, r1, r2 +# CHECK-NEXT: - 1.00 1.00 - 2.00 - - - umlals r2, r9, r2, r3 +# CHECK-NEXT: - 1.00 1.00 - 2.00 - - - umlalseq r3, r5, r1, r2 # CHECK-NEXT: - - - - 2.00 - - - umull r2, r4, r6, r8 # CHECK-NEXT: - - - - 2.00 - - - umullgt r6, r1, r2, r6 -# CHECK-NEXT: - - - - 2.00 - - - umulls r2, r9, r2, r3 -# CHECK-NEXT: - - - - 2.00 - - - umullseq r3, r5, r1, r2 +# CHECK-NEXT: - 1.00 1.00 - 2.00 - - - umulls r2, r9, r2, r3 +# CHECK-NEXT: - 1.00 1.00 - 2.00 - - - umullseq r3, r5, r1, r2 # CHECK-NEXT: - - - - 1.00 - - - uqadd16 r1, r2, 
r3 # CHECK-NEXT: - - - - 1.00 - - - uqadd16gt r4, r7, r9 # CHECK-NEXT: - - - - 1.00 - - - uqadd8 r3, r4, r8 diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp --- a/llvm/utils/TableGen/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/CodeGenSchedule.cpp @@ -1315,6 +1315,16 @@ SmallVector, 16> WriteSequences; SmallVector, 16> ReadSequences; SmallVector ProcIndices; + + PredTransition() = default; + PredTransition(ArrayRef PT) { + PredTerm.assign(PT.begin(), PT.end()); + ProcIndices.assign(1, 0); + } + PredTransition(ArrayRef PT, ArrayRef PIds) { + PredTerm.assign(PT.begin(), PT.end()); + ProcIndices.assign(PIds.begin(), PIds.end()); + } }; // Encapsulate a set of partially constructed transitions. @@ -1328,7 +1338,8 @@ PredTransitions(CodeGenSchedModels &sm): SchedModels(sm) {} void substituteVariantOperand(const SmallVectorImpl &RWSeq, - bool IsRead, unsigned StartIdx); + bool IsRead, bool IsForAnyCPU, + unsigned StartIdx); void substituteVariants(const PredTransition &Trans); @@ -1568,7 +1579,20 @@ // starts. RWSeq must be applied to all transitions between StartIdx and the end // of TransVec. void PredTransitions::substituteVariantOperand( - const SmallVectorImpl &RWSeq, bool IsRead, unsigned StartIdx) { + const SmallVectorImpl &RWSeq, bool IsRead, bool IsForAnyCPU, + unsigned StartIdx) { + + auto CollectAndAddVariants = [&](unsigned TransIdx, + const CodeGenSchedRW &SchedRW) { + // Distribute this partial PredTransition across intersecting variants. + // This will push copies of TransVec[TransIdx] on the back of TransVec. + std::vector IntersectingVariants; + getIntersectingVariants(SchedRW, TransIdx, IntersectingVariants); + // Now expand each variant on top of its copy of the transition. + for (const TransVariant &IV : IntersectingVariants) + pushVariant(IV, IsRead); + return !IntersectingVariants.empty(); + }; // Visit each original RW within the current sequence. 
for (SmallVectorImpl::const_iterator @@ -1577,6 +1601,7 @@ // Push this RW on all partial PredTransitions or distribute variants. // New PredTransitions may be pushed within this loop which should not be // revisited (TransEnd must be loop invariant). + bool HasAliases = false, WasPushed = false; for (unsigned TransIdx = StartIdx, TransEnd = TransVec.size(); TransIdx != TransEnd; ++TransIdx) { // In the common case, push RW onto the current operand's sequence. @@ -1587,17 +1612,22 @@ TransVec[TransIdx].WriteSequences.back().push_back(*RWI); continue; } - // Distribute this partial PredTransition across intersecting variants. - // This will push a copies of TransVec[TransIdx] on the back of TransVec. - std::vector IntersectingVariants; - getIntersectingVariants(SchedRW, TransIdx, IntersectingVariants); - // Now expand each variant on top of its copy of the transition. - for (std::vector::const_iterator - IVI = IntersectingVariants.begin(), - IVE = IntersectingVariants.end(); - IVI != IVE; ++IVI) { - pushVariant(*IVI, IsRead); - } + HasAliases = true; + WasPushed |= CollectAndAddVariants(TransIdx, SchedRW); + } + if (IsRead && IsForAnyCPU && HasAliases && !WasPushed) { + // If we're here this means that in some sched class: + // a) We have a read variant for CPU A + // b) We have a write variant for CPU B + // c) We don't have a write variant for CPU A + // d) We must expand all read/write variants (IsForAnyCPU is true) + // e) We couldn't expand SchedRW because TransVec doesn't have + // any transition with compatible CPU ID. + // In such a case we create a new empty transition with zero (AnyCPU) + // index. + TransVec.emplace_back(TransVec[StartIdx].PredTerm); + TransVec.back().ReadSequences.emplace_back(); + CollectAndAddVariants(TransVec.size() - 1, SchedRW); } } } @@ -1612,10 +1642,9 @@ // Build up a set of partial results starting at the back of // PredTransitions. Remember the first new transition. 
unsigned StartIdx = TransVec.size(); - TransVec.emplace_back(); - TransVec.back().PredTerm = Trans.PredTerm; - TransVec.back().ProcIndices = Trans.ProcIndices; + TransVec.emplace_back(Trans.PredTerm, Trans.ProcIndices); + bool IsForAnyCPU = llvm::count(Trans.ProcIndices, 0); // Visit each original write sequence. for (SmallVectorImpl>::const_iterator WSI = Trans.WriteSequences.begin(), WSE = Trans.WriteSequences.end(); @@ -1625,7 +1654,7 @@ TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) { I->WriteSequences.emplace_back(); } - substituteVariantOperand(*WSI, /*IsRead=*/false, StartIdx); + substituteVariantOperand(*WSI, /*IsRead=*/false, IsForAnyCPU, StartIdx); } // Visit each original read sequence. for (SmallVectorImpl>::const_iterator @@ -1636,7 +1665,7 @@ TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) { I->ReadSequences.emplace_back(); } - substituteVariantOperand(*RSI, /*IsRead=*/true, StartIdx); + substituteVariantOperand(*RSI, /*IsRead=*/true, IsForAnyCPU, StartIdx); } }