diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h b/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h
@@ -0,0 +1,23 @@
+//===- AMDGPUMFMAClustering.h - AMDGPU MFMA Clustering ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+/// Create a DAG mutation that clusters independent MFMA instructions.
+/// Returns nullptr unless -amdgpu-mfma-cluster is enabled.
+std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp
@@ -0,0 +1,204 @@
+//===- AMDGPUMFMAClustering.cpp - AMDGPU MFMA Clustering ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to cluster MFMA
+/// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMFMAClustering.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mfma-clustering"
+
+static cl::opt<bool> EnableMFMACluster("amdgpu-mfma-cluster", cl::Hidden,
+                                       cl::desc("Enable MFMA clustering"),
+                                       cl::init(false));
+
+static cl::opt<unsigned>
+    MaxMFMAClusterSize("amdgpu-mfma-cluster-size", cl::init(5), cl::Hidden,
+                       cl::desc("The maximum number of MFMA instructions to "
+                                "attempt to cluster together."));
+
+namespace {
+
+/// DAG mutation that chains independent MFMA instructions with cluster edges.
+class MFMAClusterDAGMutation : public ScheduleDAGMutation {
+public:
+  MFMAClusterDAGMutation() = default;
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+} // end anonymous namespace
+
+/// Append every MFMA in \p DAG to \p MFMASUnits in NodeNum order.
+static void collectMFMASUnits(SmallVectorImpl<SUnit *> &MFMASUnits,
+                              const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
+  for (SUnit &SU : DAG->SUnits) {
+    const MachineInstr &MI = *SU.getInstr();
+    if (!TII->isMAI(MI))
+      continue;
+
+    // V_ACCVGPR read/write are excluded by opcode; only the rest is clustered.
+    const unsigned Opc = MI.getOpcode();
+    if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
+        Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
+      continue;
+
+    MFMASUnits.push_back(&SU);
+
+    LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU););
+  }
+
+  // DAG->SUnits is indexed by NodeNum, so the walk above already produces the
+  // NodeNum order that results in a good clustering order.
+  assert(llvm::is_sorted(MFMASUnits,
+                         [](const SUnit *A, const SUnit *B) {
+                           return A->NodeNum < B->NodeNum;
+                         }) &&
+         "MFMA SUnits must be collected in NodeNum order");
+}
+
+/// Give every member of a cluster the combined dependencies of the cluster.
+///
+/// Every node in \p ClusterSuccs is made to depend on each member, and each
+/// member is made to depend on every node in \p ClusterPreds, using artificial
+/// edges. \p ClusterMembers is in chain order and starts with the cluster base.
+static void propagateDeps(ArrayRef<SUnit *> ClusterMembers,
+                          ArrayRef<SDep> ClusterPreds,
+                          ArrayRef<SDep> ClusterSuccs, ScheduleDAGInstrs *DAG) {
+  const SUnit *ClusterBase = ClusterMembers.front();
+
+  for (SUnit *Member : ClusterMembers) {
+    LLVM_DEBUG(dbgs() << "Copying Deps To SU(" << Member->NodeNum << ")\n");
+
+    for (const SDep &Succ : ClusterSuccs) {
+      LLVM_DEBUG(dbgs() << "Copying Succ SU(" << Succ.getSUnit()->NodeNum
+                        << ")\n");
+      DAG->addEdge(Succ.getSUnit(), SDep(Member, SDep::Artificial));
+    }
+
+    for (const SDep &Pred : ClusterPreds) {
+      // Never make a member depend on the base of its own cluster.
+      if (Pred.getSUnit() == ClusterBase)
+        continue;
+      LLVM_DEBUG(dbgs() << "Copying Pred SU(" << Pred.getSUnit()->NodeNum
+                        << ")\n");
+      DAG->addEdge(Member, SDep(Pred.getSUnit(), SDep::Artificial));
+    }
+  }
+}
+
+/// Greedily chain independent MFMAs into clusters of at most
+/// -amdgpu-mfma-cluster-size instructions.
+///
+/// Walks \p MFMASUnits in order. Each MFMA that is not yet clustered becomes a
+/// cluster base; later MFMAs that are unclustered and independent of the base
+/// are appended to the chain with a cluster edge. Once a chain is complete the
+/// dependencies of all members are propagated to every member.
+static void clusterNeighboringMFMAs(ArrayRef<SUnit *> MFMASUnits,
+                                    ScheduleDAGInstrs *DAG) {
+  // Indexed by NodeNum; set once an MFMA has been placed in a cluster.
+  BitVector Clustered(DAG->SUnits.size());
+  const unsigned MaxClusterSize = MaxMFMAClusterSize;
+  const unsigned End = MFMASUnits.size();
+
+  // "Idx + 1 < End" (not "Idx < End - 1") so that an empty list cannot wrap.
+  for (unsigned Idx = 0; Idx + 1 < End; ++Idx) {
+    SUnit *ClusterBase = MFMASUnits[Idx];
+    if (Clustered.test(ClusterBase->NodeNum))
+      continue; // We don't want to cluster against a different cluster.
+
+    SmallVector<SUnit *, 8> ClusterMembers;
+    ClusterMembers.push_back(ClusterBase);
+    SmallVector<SDep, 8> ClusterPreds(ClusterBase->Preds.begin(),
+                                      ClusterBase->Preds.end());
+    SmallVector<SDep, 8> ClusterSuccs(ClusterBase->Succs.begin(),
+                                      ClusterBase->Succs.end());
+
+    // Attempt to cluster all the remaining MFMAs in a chain starting at
+    // ClusterBase.
+    for (unsigned NextIdx = Idx + 1;
+         NextIdx < End && ClusterMembers.size() < MaxClusterSize; ++NextIdx) {
+      SUnit *Candidate = MFMASUnits[NextIdx];
+
+      // Only add independent MFMAs that have not been previously clustered.
+      if (Clustered.test(Candidate->NodeNum) ||
+          DAG->IsReachable(Candidate, ClusterBase) ||
+          DAG->IsReachable(ClusterBase, Candidate))
+        continue;
+
+      // Snapshot the predecessors before linking: a successful addEdge makes
+      // the chain tail one of the candidate's predecessors, and that edge must
+      // not be propagated to the rest of the cluster.
+      SUnit *Tail = ClusterMembers.back();
+      const SmallVector<SDep, 4> CandidatePreds(Candidate->Preds.begin(),
+                                                Candidate->Preds.end());
+      if (!DAG->addEdge(Candidate, SDep(Tail, SDep::Cluster)))
+        continue; // Rejected candidates contribute nothing to the cluster.
+
+      // Enforce ordering to ensure root/leaf of cluster chain gets
+      // scheduled first/last.
+      // NOTE(review): presumably this is dropped when the cluster edge to the
+      // same predecessor already exists - confirm against SUnit::addPred.
+      DAG->addEdge(Candidate, SDep(Tail, SDep::Artificial));
+
+      LLVM_DEBUG(dbgs() << "Cluster MFMA SU(" << Tail->NodeNum << ") - SU("
+                        << Candidate->NodeNum << ")\n");
+
+      // Aggregate the dependencies of accepted members for propagation.
+      ClusterPreds.append(CandidatePreds.begin(), CandidatePreds.end());
+      ClusterSuccs.append(Candidate->Succs.begin(), Candidate->Succs.end());
+      ClusterMembers.push_back(Candidate);
+    }
+
+    // A base that found no partner is not a cluster.
+    if (ClusterMembers.size() < 2)
+      continue;
+
+    for (const SUnit *Member : ClusterMembers)
+      Clustered.set(Member->NodeNum);
+    propagateDeps(ClusterMembers, ClusterPreds, ClusterSuccs, DAG);
+  }
+}
+
+void MFMAClusterDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasMAIInsts() || DAGInstrs->SUnits.empty())
+    return;
+
+  SmallVector<SUnit *, 8> MFMASUnits;
+  collectMFMASUnits(MFMASUnits, ST.getInstrInfo(), DAGInstrs);
+
+  // A cluster needs at least two MFMAs.
+  if (MFMASUnits.size() < 2)
+    return;
+
+  clusterNeighboringMFMAs(MFMASUnits, DAGInstrs);
+}
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation() {
+  if (!EnableMFMACluster)
+    return nullptr;
+  return std::make_unique<MFMAClusterDAGMutation>();
+}
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -16,6 +16,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUAliasAnalysis.h"
 #include "AMDGPUExportClustering.h"
+#include "AMDGPUMFMAClustering.h"
 #include "AMDGPUMacroFusion.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
@@ -394,6 +395,7 @@
   ScheduleDAGMILive *DAG =
     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createMFMAClusterDAGMutation());
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
@@ -875,6 +877,7 @@
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
+  DAG->addMutation(createMFMAClusterDAGMutation());
   return DAG;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -75,6 +75,7 @@
   AMDGPUMachineModuleInfo.cpp
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
+  AMDGPUMFMAClustering.cpp
   AMDGPUMIRFormatter.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPerfHintAnalysis.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir b/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir
@@ -0,0 +1,71 @@
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1 | FileCheck -check-prefix=PRERA %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 --debug-only=amdgpu-mfma-clustering 2>&1 | FileCheck -check-prefix=TWOLIMIT %s
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1| FileCheck -check-prefix=POSTRA %s
+# REQUIRES: asserts
+
+# PRERA: Cluster MFMA SU(2) - SU(6)
+# PRERA-NEXT: Cluster MFMA SU(6) - SU(10)
+# PRERA-NEXT: Cluster MFMA SU(10) - SU(12)
+
+# TWOLIMIT: Cluster MFMA SU(2) - SU(6)
+# TWOLIMIT: Cluster MFMA SU(10) - SU(11)
+
+# POSTRA: Cluster MFMA SU(2) - SU(6)
+# POSTRA-NEXT: Cluster MFMA SU(6) - SU(10)
+# POSTRA-NEXT: Cluster MFMA SU(10) - SU(12)
+
+---
+name: basic_cluster
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
+
+# PRERA: Cluster MFMA SU(12) - SU(16)
+# PRERA-NEXT: Cluster MFMA SU(16) - SU(20)
+
+# POSTRA: Cluster MFMA SU(12) - SU(16)
+# POSTRA-NEXT: Cluster MFMA SU(16) - SU(20)
+
+---
+name: complex_cluster
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr9 = V_MOV_B32_e32 9, implicit $exec
+    $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
+    $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
+    $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+    $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+    $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
+    $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
+    $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
+    $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir b/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir @@ -0,0 +1,354 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=PRERA %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=BOTHSCHEDPASS %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 2>&1 | FileCheck -check-prefix=TWOLIMIT %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1| FileCheck -check-prefix=POSTRA %s + + +--- +name: no_cluster +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $vgpr10_vgpr11 + ; PRERA-LABEL: name: no_cluster + ; PRERA: liveins: $sgpr0, $vgpr10_vgpr11 + ; PRERA-NEXT: {{ $}} + ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec + ; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; PRERA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec + ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; PRERA-NEXT: $vgpr6 = 
V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec + ; DEFAULT-LABEL: name: no_cluster + ; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec + ; BOTHSCHEDPASS-LABEL: name: no_cluster + ; BOTHSCHEDPASS: liveins: $sgpr0, $vgpr10_vgpr11 + ; BOTHSCHEDPASS-NEXT: {{ $}} + ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + 
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec + ; TWOLIMIT-LABEL: name: no_cluster + ; TWOLIMIT: liveins: $sgpr0, $vgpr10_vgpr11 + ; TWOLIMIT-NEXT: {{ $}} + ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; TWOLIMIT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec + ; POSTRA-LABEL: name: no_cluster + ; POSTRA: liveins: $sgpr0, $vgpr10_vgpr11 + ; POSTRA-NEXT: {{ $}} + ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec + ; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; POSTRA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec + ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; POSTRA-NEXT: $vgpr6 = 
V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + $vgpr8 = V_MOV_B32_e32 0, implicit $exec + $vgpr9 = V_MOV_B32_e32 9, implicit $exec + $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 1, implicit $exec + $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec + $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec + $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec +... + + +--- +name: basic_cluster +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 + ; PRERA-LABEL: name: basic_cluster + ; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 + ; PRERA-NEXT: {{ $}} + ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, 
implicit $exec + ; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-LABEL: name: basic_cluster + ; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-LABEL: name: basic_cluster + ; BOTHSCHEDPASS: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 + ; BOTHSCHEDPASS-NEXT: {{ $}} + ; 
BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; TWOLIMIT-LABEL: name: basic_cluster + ; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 + ; TWOLIMIT-NEXT: {{ $}} + ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; 
TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-LABEL: name: basic_cluster + ; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 + ; POSTRA-NEXT: {{ $}} + ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, 
implicit $mode, implicit $exec + ; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2 = V_MOV_B32_e32 1, implicit $exec + $vgpr3 = V_MOV_B32_e32 1, implicit $exec + $vgpr4 = V_MOV_B32_e32 1, implicit $exec + $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + $vgpr5 = V_MOV_B32_e32 1, implicit $exec + $vgpr6 = V_MOV_B32_e32 1, implicit $exec + $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec +... 
+ + +--- +name: complex_cluster +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 + ; PRERA-LABEL: name: complex_cluster + ; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 + ; PRERA-NEXT: {{ $}} + ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec + ; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; PRERA-NEXT: $vgpr5 = 
V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec + ; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; DEFAULT-LABEL: name: complex_cluster + ; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec + ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, 
implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec + ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; BOTHSCHEDPASS-LABEL: name: complex_cluster + ; BOTHSCHEDPASS: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $vgpr10_vgpr11 + ; BOTHSCHEDPASS-NEXT: {{ $}} + ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed 
$vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec + ; TWOLIMIT-LABEL: name: complex_cluster + ; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 + ; TWOLIMIT-NEXT: {{ $}} + ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + 
; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; POSTRA-LABEL: name: complex_cluster + ; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 + ; POSTRA-NEXT: {{ $}} + ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec + ; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, 
implicit $exec + ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + $vgpr8 = V_MOV_B32_e32 0, implicit $exec + $vgpr9 = V_MOV_B32_e32 9, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2 = V_MOV_B32_e32 1, implicit $exec + $vgpr3 = V_MOV_B32_e32 1, implicit $exec + $vgpr4 = V_MOV_B32_e32 1, implicit $exec + $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec + $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + $vgpr5 = V_MOV_B32_e32 1, implicit $exec + $vgpr6 = V_MOV_B32_e32 1, implicit $exec + $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + $agpr4_agpr5_agpr6_agpr7 = 
V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec + $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec +...