diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp deleted file mode 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.cpp +++ /dev/null @@ -1,173 +0,0 @@ -//===--- AMDGPUMFMAClusting.cpp - AMDGPU MFMA Clustering -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file This file contains a DAG scheduling mutation to cluster MFMA -/// instructions. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMFMAClustering.h" -#include "AMDGPUTargetMachine.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineScheduler.h" - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-mfma-clustering" - -namespace { - -static cl::opt EnableMFMACluster("amdgpu-mfma-cluster", - cl::desc("Enable MFMA clustering"), - cl::init(false)); - -static cl::opt - MaxMFMAClusterSize("amdgpu-mfma-cluster-size", cl::init(5), cl::Hidden, - cl::desc("The maximum number of MFMA instructions to " - "attempt to cluster together.")); - -class MFMAClusterDAGMutation : public ScheduleDAGMutation { - const SIInstrInfo *TII; - ScheduleDAGMI *DAG; - -public: - MFMAClusterDAGMutation() = default; - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; - -static void collectMFMASUnits(SmallVectorImpl &MFMASUnits, - const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) { - for (SUnit &SU : DAG->SUnits) { - MachineInstr &MAI = *SU.getInstr(); - if (!TII->isMAI(MAI) || - MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || - MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64) - continue; - - MFMASUnits.push_back(&SU); - - LLVM_DEBUG(dbgs() << "Found 
MFMA: "; DAG->dumpNode(SU);); - } - - // Sorting the MFMAs in NodeNum order results in a good clustering order - std::sort(MFMASUnits.begin(), MFMASUnits.end(), - [](SUnit *a, SUnit *b) { return a->NodeNum < b->NodeNum; }); -} - -static void propagateDeps(DenseMap &SUnit2ClusterInfo, - llvm::ArrayRef ClusterPreds, - llvm::ArrayRef ClusterSuccs, - unsigned ClusterNum, ScheduleDAGInstrs *DAG) { - - for (auto Node : SUnit2ClusterInfo) { - if (Node.second != ClusterNum) - continue; // Only add the combined succs to the current cluster - - LLVM_DEBUG(dbgs() << "Copying Deps To SU(" << Node.first << ")\n"); - - for (const SDep &Succ : ClusterSuccs) { - LLVM_DEBUG(dbgs() << "Copying Succ SU(" << Succ.getSUnit()->NodeNum - << ")\n"); - DAG->addEdge(Succ.getSUnit(), - SDep(&DAG->SUnits[Node.first], SDep::Artificial)); - } - - for (const SDep &Pred : ClusterPreds) { - LLVM_DEBUG(dbgs() << "Copying Pred SU(" << Pred.getSUnit()->NodeNum - << ")\n"); - if (Pred.getSUnit()->NodeNum == ClusterNum) - continue; - DAG->addEdge(&DAG->SUnits[Node.first], - SDep(Pred.getSUnit(), SDep::Artificial)); - } - } -} - -static void clusterNeighboringMFMAs(llvm::ArrayRef MFMASUnits, - ScheduleDAGInstrs *DAG) { - - DenseMap SUnit2ClusterInfo; - - for (unsigned Idx = 0, End = MFMASUnits.size(); Idx < (End - 1); ++Idx) { - if (SUnit2ClusterInfo.count(MFMASUnits[Idx]->NodeNum)) - continue; // We don't want to cluster against a different cluster - - auto MFMAOpa = MFMASUnits[Idx]; - auto ClusterBase = MFMAOpa; - unsigned ClusterNum = ClusterBase->NodeNum; - SmallVector ClusterSuccs(MFMAOpa->Succs); - SmallVector ClusterPreds(MFMAOpa->Preds); - unsigned NextIdx = Idx + 1; - unsigned ClusterSize = 1; - - // Attempt to cluster all the remaining MFMASunits in a chain - // starting at ClusterBase/MFMAOpa. 
- for (; NextIdx < End; ++NextIdx) { - if (ClusterSize >= MaxMFMAClusterSize || NextIdx >= End) - break; - // Only add independent MFMAs that have not been previously clustered - if (SUnit2ClusterInfo.count(MFMASUnits[NextIdx]->NodeNum) || - DAG->IsReachable(MFMASUnits[NextIdx], ClusterBase) || - DAG->IsReachable(ClusterBase, MFMASUnits[NextIdx])) - continue; - - auto MFMAOpb = MFMASUnits[NextIdx]; - // Aggregate the cluster inst dependencies for dep propogation - ClusterPreds.append(MFMAOpb->Preds); - ClusterSuccs.append(MFMAOpb->Succs); - if (!DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Cluster))) - continue; - - // Enforce ordering to ensure root/leaf of cluster chain gets - // scheduled first/last - DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Artificial)); - - LLVM_DEBUG(dbgs() << "Cluster MFMA SU(" << MFMAOpa->NodeNum << ") - SU(" - << MFMAOpb->NodeNum << ")\n"); - - SUnit2ClusterInfo[MFMAOpb->NodeNum] = ClusterNum; - SUnit2ClusterInfo[MFMAOpa->NodeNum] = ClusterNum; - ++ClusterSize; - MFMAOpa = MFMAOpb; - } - propagateDeps(SUnit2ClusterInfo, ClusterPreds, ClusterSuccs, ClusterNum, - DAG); - } -} - -void MFMAClusterDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { - const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); - TII = ST.getInstrInfo(); - if (!ST.hasMAIInsts()) - return; - DAG = static_cast(DAGInstrs); - const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); - if (!TSchedModel || DAG->SUnits.empty()) - return; - - SmallVector MFMASUnits; - collectMFMASUnits(MFMASUnits, TII, DAG); - - if (MFMASUnits.size() < 2) - return; - - clusterNeighboringMFMAs(MFMASUnits, DAG); -} - -} // namespace - -namespace llvm { - -std::unique_ptr createMFMAClusterDAGMutation() { - return EnableMFMACluster ? 
std::make_unique() - : nullptr; -} - -} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h b/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h rename from llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h rename to llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h --- a/llvm/lib/Target/AMDGPU/AMDGPUMFMAClustering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h @@ -1,4 +1,4 @@ -//===- AMDGPUMFMAClustering.h - AMDGPU MFMA Clustering ------*- C++ -*-===// +//===- AMDGPUMFMAIGroupLP.h - AMDGPU MFMA IGroupLP --------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,16 +6,16 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H #include "llvm/CodeGen/ScheduleDAGMutation.h" #include namespace llvm { -std::unique_ptr createMFMAClusterDAGMutation(); +std::unique_ptr createMFMAIGroupLPDAGMutation(); } // namespace llvm -#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp
@@ -0,0 +1,228 @@
+//===--- AMDGPUMFMAIGroupLP.cpp - AMDGPU MFMA IGroupLP ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file This file contains a DAG scheduling mutation which tries to coerce
+// the scheduler into generating an ordering based on ordering of groups
+// of instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMFMAIGroupLP.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-MFMA-IGroupLP"
+
+namespace {
+
+static cl::opt<bool>
+    EnableMFMAIGroupLP("amdgpu-mfma-igrouplp",
+                       cl::desc("Enable construction of Instruction Groups and "
+                                "their ordering for scheduling"),
+                       cl::init(false));
+
+static cl::opt<int>
+    VMEMGroupMaxSize("amdgpu-vmem-group-size", cl::init(-1), cl::Hidden,
+                     cl::desc("The maximum number of instructions to include "
+                              "in VMEM group."));
+
+static cl::opt<int>
+    MFMAGroupMaxSize("amdgpu-mfma-group-size", cl::init(-1), cl::Hidden,
+                     cl::desc("The maximum number of instructions to include "
+                              "in MFMA group."));
+
+static cl::opt<int>
+    LDRGroupMaxSize("amdgpu-ldr-group-size", cl::init(-1), cl::Hidden,
+                    cl::desc("The maximum number of instructions to include "
+                             "in lds/gds read group."));
+
+static cl::opt<int>
+    LDWGroupMaxSize("amdgpu-ldw-group-size", cl::init(-1), cl::Hidden,
+                    cl::desc("The maximum number of instructions to include "
+                             "in lds/gds write group."));
+
+typedef function_ref<bool(const MachineInstr &)> IsInstructionType;
+
+// One stage of the pipeline: the SUnits collected for it, the predicate that
+// decides membership, and an optional cap on the number of members.
+struct InstructionClass {
+  SmallVector<SUnit *, 32> Collection;
+  // Non-owning: the callable it refers to must outlive this object.
+  const IsInstructionType isInstructionClass;
+  // Values <= 0 mean the stage is unbounded.
+  const int MaxSize;
+
+  InstructionClass(IsInstructionType IsInstructionClass, int maxSize)
+      : isInstructionClass(IsInstructionClass), MaxSize(maxSize) {}
+
+  bool IsFull() const {
+    return MaxSize > 0 && (int)Collection.size() >= MaxSize;
+  }
+};
+
+class MFMAIGroupLPDAGMutation : public ScheduleDAGMutation {
+public:
+  const SIInstrInfo *TII = nullptr;
+  ScheduleDAGMI *DAG = nullptr;
+
+  MFMAIGroupLPDAGMutation() = default;
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+// Return the first stage, in pipeline order, whose predicate accepts MI and
+// which still has room; nullptr if there is none.
+static InstructionClass *
+findStage(ArrayRef<InstructionClass *> PipelineOrder, const MachineInstr &MI) {
+  for (InstructionClass *Stage : PipelineOrder)
+    if (Stage->isInstructionClass(MI) && !Stage->IsFull())
+      return Stage;
+  return nullptr;
+}
+
+// Put each SUnit into at most one stage. A BUNDLE is collected only if all of
+// its instructions belong to the stage chosen for its first instruction.
+static void collectSUnits(SmallVectorImpl<InstructionClass *> &PipelineOrder,
+                          const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
+  for (SUnit &SU : DAG->SUnits) {
+    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
+
+    MachineInstr *MI = SU.getInstr();
+    if (MI->getOpcode() != TargetOpcode::BUNDLE) {
+      // First match wins: an instruction accepted by several predicates must
+      // not join two stages, or addPipelineEdges would pair it with itself.
+      if (InstructionClass *Stage = findStage(PipelineOrder, *MI))
+        Stage->Collection.push_back(&SU);
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "Checking bundled insts\n";);
+
+    // Step over the BUNDLE header to the first bundled instruction.
+    MachineBasicBlock::instr_iterator BundledMI = MI->getIterator();
+    ++BundledMI;
+
+    InstructionClass *MatchingStage = findStage(PipelineOrder, *BundledMI);
+    if (MatchingStage == nullptr)
+      continue;
+
+    // Check the remaining instructions. A mismatch anywhere, including on the
+    // last instruction of the bundle, disqualifies the whole bundle.
+    bool AllSameType = true;
+    while (AllSameType && BundledMI->isBundledWithSucc()) {
+      ++BundledMI;
+      AllSameType = MatchingStage->isInstructionClass(*BundledMI);
+    }
+
+    if (AllSameType) {
+      LLVM_DEBUG(dbgs() << "Bundle is all of same type\n";);
+      MatchingStage->Collection.push_back(&SU);
+    }
+  }
+}
+
+// Add an artificial edge from every SUnit of a stage to every SUnit of each
+// later stage, wherever DAG->canAddEdge permits it.
+static void
+addPipelineEdges(const llvm::ArrayRef<InstructionClass *> PipelineOrder,
+                 ScheduleDAGInstrs *DAG) {
+  for (int i = 0; i < (int)PipelineOrder.size() - 1; i++) {
+    auto StageA = PipelineOrder[i];
+    for (int j = i + 1; j < (int)PipelineOrder.size(); j++) {
+      auto StageB = PipelineOrder[j];
+      for (auto SUnitA : StageA->Collection) {
+        LLVM_DEBUG(dbgs() << "Adding edges for: "; DAG->dumpNode(*SUnitA););
+        for (auto SUnitB : StageB->Collection) {
+          if (DAG->canAddEdge(SUnitB, SUnitA)) {
+            DAG->addEdge(SUnitB, SDep(SUnitA, SDep::Artificial));
+            LLVM_DEBUG(dbgs() << "Added edge to: "; DAG->dumpNode(*SUnitB););
+          } else {
+            LLVM_DEBUG(dbgs() << "Can't add edge to: ";
+                       DAG->dumpNode(*SUnitB););
+          }
+        }
+      }
+    }
+  }
+}
+
+void MFMAIGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  if (!ST.hasMAIInsts())
+    return;
+  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
+  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
+  if (!TSchedModel || DAG->SUnits.empty())
+    return;
+
+  // function_ref does not own its callable, so every predicate is kept in a
+  // named local that outlives the InstructionClass referring to it. Binding a
+  // function_ref variable directly to a lambda temporary would dangle.
+  auto isMFMAFn = [this](const MachineInstr &MI) {
+    if (TII->isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+        MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64) {
+      LLVM_DEBUG(dbgs() << "Found MFMA\n";);
+      return true;
+    }
+    return false;
+  };
+  InstructionClass MFMASUnits(isMFMAFn, MFMAGroupMaxSize);
+
+  auto isVMEMReadFn = [this](const MachineInstr &MI) {
+    if (((TII->isFLAT(MI) && !TII->isDS(MI)) || TII->isVMEM(MI)) &&
+        MI.mayLoad()) {
+      LLVM_DEBUG(dbgs() << "Found VMEM read\n";);
+      return true;
+    }
+    return false;
+  };
+  InstructionClass VMEMReadSUnits(isVMEMReadFn, VMEMGroupMaxSize);
+
+  auto isDSWriteFn = [this](const MachineInstr &MI) {
+    if (TII->isDS(MI) && MI.mayStore()) {
+      LLVM_DEBUG(dbgs() << "Found DS Write\n";);
+      return true;
+    }
+    return false;
+  };
+  InstructionClass DSWriteSUnits(isDSWriteFn, LDWGroupMaxSize);
+
+  auto isDSReadFn = [this](const MachineInstr &MI) {
+    if (TII->isDS(MI) && MI.mayLoad()) {
+      LLVM_DEBUG(dbgs() << "Found DS Read\n";);
+      return true;
+    }
+    return false;
+  };
+  InstructionClass DSReadSUnits(isDSReadFn, LDRGroupMaxSize);
+
+  // Earlier entries are ordered ahead of later ones by addPipelineEdges.
+  SmallVector<InstructionClass *, 4> PipelineOrder = {
+      &VMEMReadSUnits, &DSReadSUnits, &MFMASUnits, &DSWriteSUnits};
+
+  collectSUnits(PipelineOrder, TII, DAG);
+
+  addPipelineEdges(PipelineOrder, DAG);
+}
+
+} // namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createMFMAIGroupLPDAGMutation() {
+  return EnableMFMAIGroupLP ? std::make_unique<MFMAIGroupLPDAGMutation>()
+                            : nullptr;
+}
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -16,7 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" #include "AMDGPUExportClustering.h" -#include "AMDGPUMFMAClustering.h" +#include "AMDGPUMFMAIGroupLP.h" #include "AMDGPUMacroFusion.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" @@ -399,7 +399,7 @@ ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createMFMAClusterDAGMutation()); + DAG->addMutation(createMFMAIGroupLPDAGMutation()); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -881,7 +881,7 @@ const GCNSubtarget &ST = C->MF->getSubtarget(); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); - DAG->addMutation(createMFMAClusterDAGMutation()); + DAG->addMutation(createMFMAIGroupLPDAGMutation()); return DAG; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -75,7 +75,7 @@ AMDGPUMachineModuleInfo.cpp AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp - AMDGPUMFMAClustering.cpp + AMDGPUMFMAIGroupLP.cpp AMDGPUMIRFormatter.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp AMDGPUPerfHintAnalysis.cpp diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir b/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cluster-edges.mir +++ /dev/null @@ -1,71 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1
--debug-only=amdgpu-mfma-clustering 2>&1 | FileCheck -check-prefix=PRERA %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 --debug-only=amdgpu-mfma-clustering 2>&1 | FileCheck -check-prefix=TWOLIMIT %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1| FileCheck -check-prefix=POSTRA %s -# REQUIRES: asserts - -# PRERA: Cluster MFMA SU(2) - SU(6) -# PRERA-NEXT: Cluster MFMA SU(6) - SU(10) -# PRERA-NEXT: Cluster MFMA SU(10) - SU(12) - -# TWOLIMIT: Cluster MFMA SU(2) - SU(6) -# TWOLIMIT: Cluster MFMA SU(10) - SU(11) - -# POSTRA: Cluster MFMA SU(2) - SU(6) -# POSTRA-NEXT: Cluster MFMA SU(6) - SU(10) -# POSTRA-NEXT: Cluster MFMA SU(10) - SU(12) - ---- -name: basic_cluster -tracksRegLiveness: true -body: | - bb.0: - liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 - $vgpr1 = V_MOV_B32_e32 1, implicit $exec - $vgpr0 = V_MOV_B32_e32 1, implicit $exec - $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - $vgpr2 = V_MOV_B32_e32 1, implicit $exec - $vgpr3 = V_MOV_B32_e32 1, implicit $exec - $vgpr4 = V_MOV_B32_e32 1, implicit $exec - $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 1, implicit $exec - $vgpr6 = V_MOV_B32_e32 1, implicit $exec - $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, 
$agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec -... - -# PRERA: Cluster MFMA SU(12) - SU(16) -# PRERA-NEXT: Cluster MFMA SU(16) - SU(20) - -# POSTRA: Cluster MFMA SU(12) - SU(16) -# POSTRA-NEXT: Cluster MFMA SU(16) - SU(20) - ---- -name: complex_cluster -tracksRegLiveness: true -body: | - bb.0: - liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 - $vgpr1 = V_MOV_B32_e32 1, implicit $exec - $vgpr0 = V_MOV_B32_e32 1, implicit $exec - $vgpr8 = V_MOV_B32_e32 0, implicit $exec - $vgpr9 = V_MOV_B32_e32 9, implicit $exec - $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - $vgpr2 = V_MOV_B32_e32 1, implicit $exec - $vgpr3 = V_MOV_B32_e32 1, implicit $exec - $vgpr4 = V_MOV_B32_e32 1, implicit $exec - $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec - $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 1, implicit $exec - $vgpr6 = V_MOV_B32_e32 1, implicit $exec - $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec - $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec -... 
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir b/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cluster.mir +++ /dev/null @@ -1,354 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=PRERA %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=BOTHSCHEDPASS %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 2>&1 | FileCheck -check-prefix=TWOLIMIT %s -# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1| FileCheck -check-prefix=POSTRA %s - - ---- -name: no_cluster -tracksRegLiveness: true -body: | - bb.0: - liveins: $sgpr0, $vgpr10_vgpr11 - ; PRERA-LABEL: name: no_cluster - ; PRERA: liveins: $sgpr0, $vgpr10_vgpr11 - ; PRERA-NEXT: {{ $}} - ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec - ; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; PRERA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec - ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; PRERA-NEXT: $vgpr6 = 
V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec - ; DEFAULT-LABEL: name: no_cluster - ; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11 - ; DEFAULT-NEXT: {{ $}} - ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec - ; BOTHSCHEDPASS-LABEL: name: no_cluster - ; BOTHSCHEDPASS: liveins: $sgpr0, $vgpr10_vgpr11 - ; BOTHSCHEDPASS-NEXT: {{ $}} - ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec - 
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec - ; TWOLIMIT-LABEL: name: no_cluster - ; TWOLIMIT: liveins: $sgpr0, $vgpr10_vgpr11 - ; TWOLIMIT-NEXT: {{ $}} - ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; TWOLIMIT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec - ; POSTRA-LABEL: name: no_cluster - ; POSTRA: liveins: $sgpr0, $vgpr10_vgpr11 - ; POSTRA-NEXT: {{ $}} - ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec - ; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; POSTRA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec - ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec - ; POSTRA-NEXT: $vgpr6 = 
V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 1, implicit $exec - $vgpr0 = V_MOV_B32_e32 1, implicit $exec - $vgpr8 = V_MOV_B32_e32 0, implicit $exec - $vgpr9 = V_MOV_B32_e32 9, implicit $exec - $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - $vgpr2 = V_MOV_B32_e32 1, implicit $exec - $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec - $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec - $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec -... - - ---- -name: basic_cluster -tracksRegLiveness: true -body: | - bb.0: - liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 - ; PRERA-LABEL: name: basic_cluster - ; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 - ; PRERA-NEXT: {{ $}} - ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, 
implicit $exec - ; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-LABEL: name: basic_cluster - ; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 - ; DEFAULT-NEXT: {{ $}} - ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-LABEL: name: basic_cluster - ; BOTHSCHEDPASS: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 - ; BOTHSCHEDPASS-NEXT: {{ $}} - ; 
BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; TWOLIMIT-LABEL: name: basic_cluster - ; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 - ; TWOLIMIT-NEXT: {{ $}} - ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; 
TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-LABEL: name: basic_cluster - ; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15 - ; POSTRA-NEXT: {{ $}} - ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, 
implicit $mode, implicit $exec - ; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $vgpr1 = V_MOV_B32_e32 1, implicit $exec - $vgpr0 = V_MOV_B32_e32 1, implicit $exec - $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - $vgpr2 = V_MOV_B32_e32 1, implicit $exec - $vgpr3 = V_MOV_B32_e32 1, implicit $exec - $vgpr4 = V_MOV_B32_e32 1, implicit $exec - $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 1, implicit $exec - $vgpr6 = V_MOV_B32_e32 1, implicit $exec - $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec -... 
- - ---- -name: complex_cluster -tracksRegLiveness: true -body: | - bb.0: - liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 - ; PRERA-LABEL: name: complex_cluster - ; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 - ; PRERA-NEXT: {{ $}} - ; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec - ; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec - ; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; PRERA-NEXT: $vgpr5 = 
V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec - ; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; DEFAULT-LABEL: name: complex_cluster - ; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 - ; DEFAULT-NEXT: {{ $}} - ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec - ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec - ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, 
implicit $mode, implicit $exec - ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec - ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; BOTHSCHEDPASS-LABEL: name: complex_cluster - ; BOTHSCHEDPASS: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $vgpr10_vgpr11 - ; BOTHSCHEDPASS-NEXT: {{ $}} - ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed 
$vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec - ; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec - ; TWOLIMIT-LABEL: name: complex_cluster - ; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 - ; TWOLIMIT-NEXT: {{ $}} - ; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - 
; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec - ; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec - ; POSTRA-LABEL: name: complex_cluster - ; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11 - ; POSTRA-NEXT: {{ $}} - ; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - ; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec - ; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - ; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec - ; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, 
implicit $exec - ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - ; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - ; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec - ; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 1, implicit $exec - $vgpr0 = V_MOV_B32_e32 1, implicit $exec - $vgpr8 = V_MOV_B32_e32 0, implicit $exec - $vgpr9 = V_MOV_B32_e32 9, implicit $exec - $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - $vgpr2 = V_MOV_B32_e32 1, implicit $exec - $vgpr3 = V_MOV_B32_e32 1, implicit $exec - $vgpr4 = V_MOV_B32_e32 1, implicit $exec - $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec - GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec - $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec - $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec - $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 1, implicit $exec - $vgpr6 = V_MOV_B32_e32 1, implicit $exec - $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec - $agpr4_agpr5_agpr6_agpr7 = 
V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec - $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec - $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec - $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec -... diff --git a/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir b/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir @@ -0,0 +1,183 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-igrouplp=1 2>&1 | FileCheck -check-prefix=PIPELINE %s + +--- +name: no_pipeline +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $vgpr10_vgpr11 + ; DEFAULT-LABEL: name: no_pipeline + ; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec + ; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + ; PIPELINE-LABEL: name: no_pipeline + ; PIPELINE: 
liveins: $sgpr0, $vgpr10_vgpr11 + ; PIPELINE-NEXT: {{ $}} + ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; PIPELINE-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; PIPELINE-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec + ; PIPELINE-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec + ; PIPELINE-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec + ; PIPELINE-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec + ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + $vgpr8 = V_MOV_B32_e32 0, implicit $exec + $vgpr9 = V_MOV_B32_e32 9, implicit $exec + $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 1, implicit $exec + $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec + $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec +... 
+ + +--- +name: full_pipe +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $sgpr0, $vgpr10_vgpr11 + ; DEFAULT-LABEL: name: full_pipe + ; DEFAULT: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec + ; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; DEFAULT-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec + ; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec + ; DEFAULT-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed 
$agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec + ; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec + ; DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec + ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { + ; DEFAULT-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; DEFAULT-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; DEFAULT-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; DEFAULT-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; DEFAULT-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; DEFAULT-NEXT: } + ; DEFAULT-NEXT: DS_WRITE_B32 $vgpr3, killed $vgpr1, 0, 16, implicit $m0, implicit $exec + ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { + ; DEFAULT-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec + ; DEFAULT-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec + ; DEFAULT-NEXT: } + ; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, 
implicit $mode, implicit $exec + ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec + ; DEFAULT-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec + ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec + ; DEFAULT-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec + ; PIPELINE-LABEL: name: full_pipe + ; PIPELINE: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19, $vgpr10_vgpr11 + ; PIPELINE-NEXT: {{ $}} + ; PIPELINE-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; PIPELINE-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; PIPELINE-NEXT: $vgpr2 = V_MOV_B32_e32 2, implicit $exec + ; PIPELINE-NEXT: $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; PIPELINE-NEXT: $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + ; PIPELINE-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec + ; PIPELINE-NEXT: $vgpr4 = V_MOV_B32_e32 4, implicit $exec + ; PIPELINE-NEXT: $vgpr5 = V_MOV_B32_e32 5, implicit $exec + ; PIPELINE-NEXT: $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec + ; PIPELINE-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; PIPELINE-NEXT: $vgpr26 = V_MOV_B32_e32 1, implicit $exec + ; PIPELINE-NEXT: $vgpr27 = V_MOV_B32_e32 1, implicit $exec + ; PIPELINE-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; PIPELINE-NEXT: $vgpr24 = V_MOV_B32_e32 1, implicit $exec + ; PIPELINE-NEXT: $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; PIPELINE-NEXT: $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + ; PIPELINE-NEXT: $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, killed $sgpr0, implicit $exec + ; PIPELINE-NEXT: $vgpr30 = V_MOV_B32_e32 30, implicit $exec + ; 
PIPELINE-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; PIPELINE-NEXT: $vgpr18 = V_MOV_B32_e32 1, implicit $exec + ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { + ; PIPELINE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; PIPELINE-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; PIPELINE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; PIPELINE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; PIPELINE-NEXT: $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; PIPELINE-NEXT: } + ; PIPELINE-NEXT: DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec + ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit killed $vgpr26_vgpr27, implicit $exec { + ; PIPELINE-NEXT: $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec + ; PIPELINE-NEXT: $vgpr20 = GLOBAL_LOAD_USHORT killed $vgpr26_vgpr27, 0, 0, implicit $exec + ; PIPELINE-NEXT: } + ; PIPELINE-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; PIPELINE-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + ; PIPELINE-NEXT: 
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + ; PIPELINE-NEXT: $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr10, killed $vgpr11, killed $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec + ; PIPELINE-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; PIPELINE-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr7, implicit $m0, implicit $exec, implicit killed $vgpr23, implicit killed $vgpr3 { + ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr0, killed $vgpr7, 0, 16, implicit $m0, implicit $exec + ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr23, killed $vgpr3, 0, 16, implicit $m0, implicit $exec + ; PIPELINE-NEXT: } + ; PIPELINE-NEXT: DS_WRITE_B32 killed $vgpr9, killed $vgpr24, 0, 16, implicit $m0, implicit $exec + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr2 = V_MOV_B32_e32 2, implicit $exec + $vgpr3 = V_MOV_B32_e32 3, implicit $exec + $vgpr4 = V_MOV_B32_e32 4, implicit $exec + $vgpr5 = V_MOV_B32_e32 5, implicit $exec + $vgpr30 = V_MOV_B32_e32 30, implicit $exec + $vgpr6 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr7 = GLOBAL_LOAD_USHORT $vgpr2_vgpr3, 0, 0, implicit $exec + $vgpr8 = GLOBAL_LOAD_USHORT $vgpr4_vgpr5, 0, 0, implicit $exec + $vgpr9 = V_MOV_B32_e32 1, implicit $exec + $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr23 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + $vgpr24 = V_MOV_B32_e32 1, implicit $exec + $agpr8_agpr9_agpr10_agpr11 = 
V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec + $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + $vgpr26 = V_MOV_B32_e32 1, implicit $exec + $vgpr27 = V_MOV_B32_e32 1, implicit $exec + $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr22 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec + $vgpr21 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec + $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + DS_WRITE_B32 $vgpr3, $vgpr1, 0, 16, implicit $m0, implicit $exec + $vgpr19 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec + $vgpr17 = V_MOV_B32_e32 1, implicit $exec + $vgpr18 = V_MOV_B32_e32 1, implicit $exec + $vgpr20 = GLOBAL_LOAD_USHORT $vgpr26_vgpr27, 0, 0, implicit $exec + DS_WRITE_B32 $vgpr0, $vgpr7, 0, 16, implicit $m0, implicit $exec + $agpr16_agpr17_agpr18_agpr19 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr16_agpr17_agpr18_agpr19, 0, 0, 0, implicit $mode, implicit $exec + DS_WRITE_B32 $vgpr23, $vgpr3, 0, 16, implicit $m0, implicit $exec + $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + DS_WRITE_B32 $vgpr9, $vgpr24, 0, 16, implicit $m0, implicit $exec +...