diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -54,6 +54,15 @@
                            cl::desc("Enable the use of AA during codegen."),
                            cl::init(true));
 
+static cl::opt<bool> EnableMFMACluster("amdgpu-mfma-cluster",
+                                       cl::desc("Enable MFMA clustering"),
+                                       cl::init(false));
+
+static cl::opt<unsigned>
+    MFMAClusterSize("amdgpu-mfma-cluster-size", cl::init(5), cl::Hidden,
+                    cl::desc("The maximum number of MFMA insts to "
+                             "attempt to cluster together."));
+
 GCNSubtarget::~GCNSubtarget() = default;
 
 GCNSubtarget &
@@ -833,6 +842,128 @@
   }
 }
 
+namespace {
+struct MFMAClusterDAGMutation : ScheduleDAGMutation {
+  const SIInstrInfo *TII;
+  ScheduleDAGMI *DAG;
+
+  MFMAClusterDAGMutation(const SIInstrInfo *tii) : TII(tii) {}
+
+  // Collect the MFMA SUnits in the region, skipping the AGPR copy pseudos
+  // which are also classified as MAI.
+  void collectMFMASUnits(SmallVectorImpl<SUnit *> &MFMASUnits) {
+    for (SUnit &SU : DAG->SUnits) {
+      MachineInstr &MAI = *SU.getInstr();
+      if (!TII->isMAI(MAI) ||
+          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
+          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
+        continue;
+
+      MFMASUnits.push_back(&SU);
+
+      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU););
+    }
+  }
+
+  void clusterNeighboringMFMAs(ArrayRef<SUnit *> MFMASUnits) {
+    DenseMap<unsigned, unsigned> SUnit2ClusterInfo;
+
+    for (unsigned Idx = 0, End = MFMASUnits.size(); Idx < (End - 1); ++Idx) {
+      // We don't want to cluster against a different cluster.
+      if (SUnit2ClusterInfo.count(MFMASUnits[Idx]->NodeNum))
+        continue;
+
+      auto MFMAOpa = MFMASUnits[Idx];
+      SmallVector<SDep, 4> ClusterSuccs(MFMAOpa->Succs);
+      unsigned NextIdx = Idx + 1;
+      unsigned ClusterSize = 1;
+
+      // Attempt to cluster all the remaining MFMASUnits with MFMAOpa.
+      // Clustering in this manner allows for nicely handling the preds and
+      // succs s.t. they don't get interspersed in the cluster.
+      while (NextIdx < End) {
+        if (ClusterSize >= MFMAClusterSize)
+          break;
+
+        // Only add independent MFMAs that have not been previously clustered.
+        for (; NextIdx < End; ++NextIdx) {
+          if (!SUnit2ClusterInfo.count(MFMASUnits[NextIdx]->NodeNum) &&
+              !DAG->IsReachable(MFMASUnits[NextIdx], MFMAOpa) &&
+              !DAG->IsReachable(MFMAOpa, MFMASUnits[NextIdx]))
+            break;
+        }
+        if (NextIdx == End)
+          break;
+
+        auto MFMAOpb = MFMASUnits[NextIdx];
+        if (MFMAOpa->NodeNum > MFMAOpb->NodeNum)
+          std::swap(MFMAOpa, MFMAOpb);
+
+        DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Cluster));
+
+        LLVM_DEBUG(dbgs() << "Cluster MFMA SU(" << MFMAOpa->NodeNum
+                          << ") - SU(" << MFMAOpb->NodeNum << ")\n");
+
+        LLVM_DEBUG(dbgs() << "Copying Preds from "; DAG->dumpNode(*MFMAOpb);
+                   dbgs() << "To "; DAG->dumpNode(*MFMAOpa););
+
+        // Copy MFMAOpb's preds onto MFMAOpa so that the scheduler cannot
+        // intersperse the preds' defs within the cluster.
+        for (const SDep &Pred : MFMAOpb->Preds) {
+          if (Pred.getSUnit() == MFMAOpa)
+            continue;
+          LLVM_DEBUG(dbgs()
+                     << "Copy Pred SU(" << Pred.getSUnit()->NodeNum << ")\n");
+          DAG->addEdge(MFMAOpa, SDep(Pred.getSUnit(), SDep::Artificial));
+        }
+
+        SUnit2ClusterInfo[MFMAOpb->NodeNum] = MFMAOpa->NodeNum;
+        SUnit2ClusterInfo[MFMAOpa->NodeNum] = MFMAOpa->NodeNum;
+        ++ClusterSize;
+        // Aggregate the succs over each inst in the cluster.
+        ClusterSuccs.append(MFMAOpb->Succs.begin(), MFMAOpb->Succs.end());
+      }
+
+      // Only add the combined succs to the current cluster.
+      for (auto Node : SUnit2ClusterInfo) {
+        if (Node.second != MFMAOpa->NodeNum)
+          continue;
+
+        for (const SDep &Succ : ClusterSuccs) {
+          if (Succ.getSUnit() == &DAG->SUnits[Node.first])
+            continue;
+
+          DAG->addEdge(Succ.getSUnit(),
+                       SDep(&DAG->SUnits[Node.first], SDep::Artificial));
+        }
+      }
+    }
+  }
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override {
+    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+    const SIMachineFunctionInfo *MFI =
+        DAGInstrs->MF.getInfo<SIMachineFunctionInfo>();
+    // The purpose of clustering is to aid with multi-wave scheduling. If our
+    // occupancy doesn't support multiple waves, bypass clustering.
+    if (!ST.hasMAIInsts() || MFI->getOccupancy() < 2)
+      return;
+    DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+    if (!TSchedModel || DAG->SUnits.empty())
+      return;
+
+    SmallVector<SUnit *, 16> MFMASUnits;
+    collectMFMASUnits(MFMASUnits);
+
+    if (MFMASUnits.size() < 2)
+      return;
+
+    clusterNeighboringMFMAs(MFMASUnits);
+  }
+};
+} // namespace
+
 namespace {
 struct FillMFMAShadowMutation : ScheduleDAGMutation {
   const SIInstrInfo *TII;
@@ -961,6 +1092,14 @@
 void GCNSubtarget::getPostRAMutations(
     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
+  Mutations.push_back(std::make_unique<MFMAClusterDAGMutation>(&InstrInfo));
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+GCNSubtarget::createMFMAClusterDAGMutation(const TargetInstrInfo *TII) const {
+  return EnableMFMACluster
+             ? std::make_unique<MFMAClusterDAGMutation>(&InstrInfo)
+             : nullptr;
 }
 
 std::unique_ptr<ScheduleDAGMutation>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -393,7 +393,9 @@
 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
       C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
+  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(ST.createMFMAClusterDAGMutation(DAG->TII));
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
@@ -875,6 +877,7 @@
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
+  DAG->addMutation(ST.createMFMAClusterDAGMutation(DAG->TII));
   return DAG;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1159,6 +1159,9 @@
       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
       const override;
 
+  std::unique_ptr<ScheduleDAGMutation>
+  createMFMAClusterDAGMutation(const TargetInstrInfo *TII) const;
+
   std::unique_ptr<ScheduleDAGMutation>
   createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
 
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir b/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir
@@ -0,0 +1,55 @@
+# REQUIRES: asserts
+# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-subtarget,machine-scheduler 2>&1 | FileCheck -check-prefix=DEFAULT %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 --misched-bottomup --debug-only=amdgpu-subtarget,machine-scheduler 2>&1 | FileCheck -check-prefix=TWOLIMIT %s
+
+
+# DEFAULT: Cluster MFMA SU(2) - SU(6)
+# DEFAULT-NEXT: Copying Preds from SU(6): $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+# DEFAULT-NEXT: To SU(2): $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+# DEFAULT-NEXT: Copy Pred SU(5)
+# DEFAULT-NEXT: Copy Pred SU(4)
+# DEFAULT-NEXT: Cluster MFMA SU(2) - SU(10)
+# DEFAULT-NEXT: Copying Preds from SU(10): $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+# DEFAULT-NEXT: To SU(2): $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+# DEFAULT-NEXT: Copy Pred SU(5)
+# DEFAULT-NEXT: Copy Pred SU(4)
+# DEFAULT-NEXT: Cluster MFMA SU(2) - SU(12)
+# DEFAULT-NEXT: Copying Preds from SU(12): $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+# DEFAULT-NEXT: To SU(2): $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+# DEFAULT-NEXT: Copy Pred SU(1)
+# DEFAULT-NEXT: Copy Pred SU(0)
+
+
+# TWOLIMIT: Cluster MFMA SU(2) - SU(6)
+# TWOLIMIT-NEXT: Copying Preds from SU(6): $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+# TWOLIMIT-NEXT: To SU(2): $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+# TWOLIMIT-NEXT: Copy Pred SU(5)
+# TWOLIMIT-NEXT: Copy Pred SU(4)
+# TWOLIMIT-NEXT: Cluster MFMA SU(10) - SU(11)
+# TWOLIMIT-NEXT: Copying Preds from SU(11): $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+# TWOLIMIT-NEXT: To SU(10): $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+# TWOLIMIT-NEXT: Copy Pred SU(9)
+# TWOLIMIT-NEXT: Copy Pred SU(8)
+# TWOLIMIT-NEXT: Copy Pred SU(6)
+
+---
+name: sched_test
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+
+...
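Note: the mutation can also be exercised outside of lit. A minimal invocation, assuming an asserts-enabled build of llc (required for the --debug-only output checked above) and a placeholder input.mir, might look like:

  llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler \
      -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 \
      --debug-only=machine-scheduler input.mir -o - 2>&1

With amdgpu-mfma-cluster left at its default of false, createMFMAClusterDAGMutation returns nullptr, which addMutation discards, so the scheduler pipeline should behave exactly as before this patch.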