diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -82,12 +82,36 @@ class SchedGroup; -typedef DenseMap> SUnitsToCandidateSGsMap; +// InstructionRule class is used to enact a filter which determines whether or +// not an SU maps to a given SchedGroup. It contains complementary data +// structures (e.g. Cache) to help those filters. +class InstructionRule { +protected: + const SIInstrInfo *TII; + unsigned SGID; + // A cache made available to the Filter to store SUnits for subsequent + // invocations of the Filter + std::optional> Cache; + +public: + virtual bool + apply(const SUnit *, const ArrayRef, + SmallVectorImpl &) { /* Base rule accepts every SU. */ + return true; + }; + + InstructionRule(const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : TII(TII), SGID(SGID) { + if (NeedsCache) { + Cache = SmallVector(); + } + } + + virtual ~InstructionRule() = default; +}; -typedef function_ref, - const SIInstrInfo *, SmallVectorImpl &, - unsigned)> - InstructionRuleType; +typedef DenseMap> SUnitsToCandidateSGsMap; // Classify instructions into groups to enable fine tuned control over the // scheduler. These groups may be more specific than current SchedModel @@ -110,7 +134,7 @@ unsigned SGID; // The different rules each instruction in this SchedGroup must conform to - SmallVector Rules; + SmallVector, 4> Rules; // Count of the number of created SchedGroups, used to initialize SGID. static unsigned NumSchedGroups; @@ -159,15 +183,18 @@ // SchedGroup. Since many rules involve the relationship between a SchedGroup // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve // time (rather than SchedGroup init time.) 
- void addRule(const InstructionRuleType &NewRule) { Rules.push_back(NewRule); } + void addRule(std::shared_ptr NewRule) { + Rules.push_back(NewRule); + } // Returns true if the SU matches all rules bool allowedByRules(const SUnit *SU, SmallVectorImpl &SyncPipe) const { if (Rules.empty()) return true; - for (auto &Rule : Rules) { - if (!Rule(SU, Collection, TII, SyncPipe, SGID)) { + for (size_t I = 0; I < Rules.size(); I++) { + auto TheRule = Rules[I].get(); + if (!TheRule->apply(SU, Collection, SyncPipe)) { return false; } } @@ -424,9 +451,10 @@ void PipelineSolver::makePipeline() { // Preserve the order of barrier for subsequent SchedGroupBarrier mutations for (auto &SyncPipeline : BestPipeline) { + LLVM_DEBUG(dbgs() << "Printing SchedGroups\n"); for (auto &SG : SyncPipeline) { - LLVM_DEBUG(dbgs() << "Printing SchedGroups\nSchedGroup with SGID " - << SG.getSGID() << " has: \n"); + LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() + << " has: \n"); SUnit *SGBarr = nullptr; for (auto &SU : SG.Collection) { if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) @@ -808,7 +836,10 @@ LLVM_DEBUG(DAG->dump()); } -enum IGLPStrategyID : int { MFMASmallGemmOptID = 0, DemoOptID = 1 }; +enum IGLPStrategyID : int { + MFMASmallGemmOptID = 0, + MFMASmallGemmSingleWaveOptID = 1, +}; // Implement a IGLP scheduling strategy. 
class IGLPStrategy { @@ -871,8 +902,199 @@ } } -class DemoOpt final : public IGLPStrategy { +class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { private: + // Whether the DS_READ is a predecessor of first four MFMA in region + class EnablesInitialMFMA final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + LLVM_DEBUG(dbgs() << "ENABLES INITIAL\n"); + if (!SyncPipe.size()) + return false; + int MFMAsFound = 0; + if (!Cache->size()) { + for (auto &Elt : SyncPipe[0].DAG->SUnits) { + if (TII->isMFMAorWMMA(*Elt.getInstr())) { + ++MFMAsFound; + if (MFMAsFound > 4) + break; + Cache->push_back(&Elt); + } + } + } + + assert(Cache->size()); + auto DAG = SyncPipe[0].DAG; + for (auto &Elt : *Cache) { + if (DAG->IsReachable(Elt, const_cast(SU))) + return true; + } + return false; + } + + EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE + class IsPermForDSW final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + auto MI = SU->getInstr(); + if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) + return false; + + bool FitsInGroup = false; + // Does the VALU have a DS_WRITE successor + if (!Collection.size()) { + for (auto &Succ : SU->Succs) { + SUnit *SuccUnit = Succ.getSUnit(); + if (TII->isDS(*SuccUnit->getInstr()) && + SuccUnit->getInstr()->mayStore()) { + Cache->push_back(SuccUnit); + FitsInGroup = true; + } + } + return FitsInGroup; + } + + assert(Cache->size()); + + // Does the VALU have a DS_WRITE successor that is the same as other + // VALU already in the group. 
The V_PERMs will all share 1 DS_W succ + return std::any_of(Cache->begin(), Cache->end(), [&SU](SUnit *Elt) { + return std::any_of(SU->Succs.begin(), SU->Succs.end(), + [&Elt](const SDep &ThisSucc) { + return ThisSucc.getSUnit() == Elt; + }); + }); + } + + IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether the SU is a successor of any element in previous SchedGroup + class IsSuccOfPrevGroup final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + SchedGroup *OtherGroup = nullptr; + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - 1) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + // Does the previous VALU have this DS_Write as a successor + return (std::any_of(OtherGroup->Collection.begin(), + OtherGroup->Collection.end(), [&SU](SUnit *Elt) { + return std::any_of(Elt->Succs.begin(), + Elt->Succs.end(), + [&SU](SDep &Succ) { + return Succ.getSUnit() == SU; + }); + })); + } + IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether the combined load width of group is 128 bits + class VMEMSize final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + auto MI = SU->getInstr(); + if (MI->getOpcode() == TargetOpcode::BUNDLE) + return false; + if (!Collection.size()) + return true; + + int NumBits = 0; + + auto TRI = TII->getRegisterInfo(); + auto &MRI = MI->getParent()->getParent()->getRegInfo(); + for (auto &Elt : Collection) { + auto Op = Elt->getInstr()->getOperand(0); + auto Size = + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); + NumBits += Size; + } + + if (NumBits < 128) { + 
assert(TII->isVMEM(*MI) && MI->mayLoad()); + if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( + MRI, MI->getOperand(0))) <= + 128) + return true; + } + + return false; + } + + VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup + // that is \p Distance steps away + class SharesPredWithPrevNthGroup final : public InstructionRule { + private: + unsigned Distance = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + SchedGroup *OtherGroup = nullptr; + if (!SyncPipe.size()) + return false; + + if (!Cache->size()) { + + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - Distance) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + for (auto &OtherEle : OtherGroup->Collection) { + for (auto &Pred : OtherEle->Preds) { + if (Pred.getSUnit()->getInstr()->getOpcode() == + AMDGPU::V_PERM_B32_e64) + Cache->push_back(Pred.getSUnit()); + } + } + } + + assert(Cache->size()); + auto DAG = SyncPipe[0].DAG; + // Does the previous DS_WRITE share a V_PERM predecessor with this + // VMEM_READ + return ( + std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) { + return DAG->IsReachable(const_cast(SU), Elt); + })); + } + SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, + unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} + }; + public: void applyIGLPStrategy( DenseMap &SyncedInstrs, @@ -880,64 +1102,245 @@ bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } - DemoOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : IGLPStrategy(DAG, TII) { IsBottomUp = 0; } }; -void 
DemoOpt::applyIGLPStrategy( +void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups) { - // Count the number of MFMA instructions. unsigned MFMACount = 0; - for (const MachineInstr &I : *DAG) - if (TII->isMFMAorWMMA(I)) + unsigned DSWCount = 0; + unsigned DSWWithPermCount = 0; + unsigned DSWWithSharedVMEMCount = 0; + unsigned DSRCount = 0; + SmallVector DSWithPerms; + for (auto &SU : DAG->SUnits) { + auto I = SU.getInstr(); + if (TII->isMFMAorWMMA(*I)) ++MFMACount; + else if (TII->isDS(*I)) { + if (I->mayLoad()) + ++DSRCount; + else if (I->mayStore()) { + ++DSWCount; + for (auto Pred : SU.Preds) { + if (Pred.getSUnit()->getInstr()->getOpcode() == + AMDGPU::V_PERM_B32_e64) { + DSWithPerms.push_back(&SU); + break; + } + } + } + } + } + DSWWithPermCount = DSWithPerms.size(); + auto I = DSWithPerms.begin(); + auto E = DSWithPerms.end(); + + // Get the count of DS_WRITES with V_PERM predecessors which + // have loop carried dependencies (WAR) on the same VMEM_READs. + // We consider partial overlap as a miss -- in other words, + // for a given DS_W, we only consider another DS_W as matching + // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred + // for every V_PERM pred of this DS_W. 
+ DenseMap VMEMLookup; + SmallVector Counted; + for (; I != E; I++) { + SUnit *Cand = nullptr; + bool MissedAny = false; + for (auto &Pred : (*I)->Preds) { + if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) + continue; - const unsigned PipelineSyncID = 0; - SchedGroup *SG = nullptr; + if (Cand && + std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) + break; - // The SU is a successor of SU in prev SchedGroup - InstructionRuleType Rule1 = - [](const SUnit *SU, ArrayRef Collection, const SIInstrInfo *TII, - SmallVectorImpl &SyncPipe, unsigned SGID) { - auto MI = SU->getInstr(); - if (MI->getOpcode() == TargetOpcode::BUNDLE) - return false; + for (auto &Succ : Pred.getSUnit()->Succs) { + auto MI = Succ.getSUnit()->getInstr(); + if (!TII->isVMEM(*MI) || !MI->mayLoad()) + continue; - SchedGroup *OtherGroup = nullptr; - for (auto &PipeSG : SyncPipe) { - if (PipeSG.getSGID() == (int)SGID - 1) { - OtherGroup = &PipeSG; - } + if (MissedAny || !VMEMLookup.size()) { + MissedAny = true; + VMEMLookup[MI] = *I; + continue; } - if (!OtherGroup) - return false; + if (!VMEMLookup.contains(MI)) { + MissedAny = true; + VMEMLookup[MI] = *I; + continue; + } + + Cand = VMEMLookup[MI]; + if (std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) { + MissedAny = true; + break; + } + } + } + if (!MissedAny && Cand) { + DSWWithSharedVMEMCount += 2; + Counted.push_back(Cand); + Counted.push_back(*I); + } + } + + assert(DSWWithSharedVMEMCount <= DSWWithPermCount); + SchedGroup *SG; + unsigned PipelineSyncID = 0; + // For kernels with V_PERM, there are enough VALU to mix in between MFMAs + if (DSWWithPermCount) { + for (unsigned I = 0; I < MFMACount; I++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); + 
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + } + + PipelineSyncID = 1; + // Phase 1: Break up DS_READ and MFMA clusters. + // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ + // prefetch + + // Make ready initial MFMA + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + // Interleave MFMA with DS_READ prefetch; counting from 4 avoids unsigned wrap + for (unsigned I = 4; I < DSRCount; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Phase 2a: Loop carried dependency with V_PERM + // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they + // depend on. Interleave MFMA to keep XDL unit busy throughout. 
+ for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared( + 1, TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); - return (std::any_of(OtherGroup->Collection.begin(), - OtherGroup->Collection.end(), [&SU](SUnit *Elt) { - return std::any_of(Elt->Succs.begin(), - Elt->Succs.end(), - [&SU](SDep &Succ) { - return Succ.getSUnit() == SU; - }); - })); - }; - - SmallVector DemoRules; - - // Each iteration of pipeline has 1 MFMA and 1 DS_W, where the DS_W is a - // successor of the MFMA - for (unsigned I = 0; I < MFMACount; ++I) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared( + 3, TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Phase 2b: Loop carried dependency without V_PERM + // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. 
+ // Interleave MFMA to keep XDL unit busy throughout. + for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); - SG->addRule(Rule1); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are + // ultimately used by two DS_WRITE + // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they + // depend on. Interleave MFMA to keep XDL unit busy throughout. + + for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); + 
SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared( + 2, TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared( + 4, TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); } } @@ -948,8 +1351,8 @@ switch (ID) { case MFMASmallGemmOptID: return std::make_unique(DAG, TII); - case DemoOptID: - return std::make_unique(DAG, TII); + case MFMASmallGemmSingleWaveOptID: + return std::make_unique(DAG, TII); } llvm_unreachable("Unknown IGLPStrategyID"); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -168,27 +168,49 @@ ; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[116:119], v1 
offset:8272 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192 +; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; GCN-NEXT: s_waitcnt lgkmcnt(7) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 +; 
GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 +; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 ; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 @@ -197,58 +219,30 @@ ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152 -; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v1 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v4 
offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672 -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[144:147] 
offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592 ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864 ; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880 ; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir @@ -0,0 +1,339 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @single-wave-phase-2b(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* 
noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17, i32 addrspace(7)* noalias %in18, i32 addrspace(7)* noalias %in19, i32 addrspace(7)* noalias %in20, i32 addrspace(7)* noalias %in21, i32 addrspace(7)* noalias %in22, i32 addrspace(7)* noalias %in23, i32 addrspace(7)* noalias %in24, i32 addrspace(7)* noalias %in25, i32 addrspace(7)* noalias %in26, i32 addrspace(7)* noalias %in27, i32 addrspace(7)* noalias %in28, i32 addrspace(7)* noalias %in29) #0 { ret void } + + !0 = distinct !{!0} + !1 = !{!1, !0} +... + + +--- +name: single-wave-phase-2b +tracksRegLiveness: true +machineFunctionInfo: + occupancy: 1 +body: | + ; GCN-LABEL: name: single-wave-phase-2b + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:areg_512_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:areg_512_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF3:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF4:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF5:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: 
[[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF32:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF33:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF34:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF35:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF38:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF39:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF40:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF41:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF42:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF43:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF44:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF45:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF46:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF47:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF48:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF49:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), 
%bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 0, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 2064, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 2080, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF8]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF9]], implicit $exec + ; GCN-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_5:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 3120, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF10]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF11]], 
implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_6:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 4128, 0, implicit $exec :: (load (s128) from %ir.in6, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF12]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF13]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_7:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 6192, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF14]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF15]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_6]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_8:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 1024, 0, implicit $exec :: (load (s128) from %ir.in8, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF16]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF17]], implicit $exec + ; GCN-NEXT: 
[[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_6]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_9:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 3088, 0, implicit $exec :: (load (s128) from %ir.in9, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF18]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF19]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub0_sub1, [[DS_READ_B128_gfx9_7]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_10:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 5152, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF20]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF21]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub2_sub3, [[DS_READ_B128_gfx9_7]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_11:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 7216, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_8]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: undef 
%63.sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF28]], [[DEF29]], [[DEF44]], implicit $exec + ; GCN-NEXT: %63.sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF30]], [[DEF31]], [[DEF44]], implicit $exec + ; GCN-NEXT: %63.sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF24]], [[DEF25]], [[DEF44]], implicit $exec + ; GCN-NEXT: %63.sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF26]], [[DEF27]], [[DEF44]], implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF40]], %63, 0, 0, implicit $exec :: (store (s128) into %ir.in0, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in12, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_1]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in13, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_2]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in14, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_3]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in15, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_8]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_4]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in16, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_5]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in17, 
!alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_6]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in18, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_7]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in19, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_9]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: undef %64.sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF36]], [[DEF37]], [[DEF44]], implicit $exec + ; GCN-NEXT: %64.sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF38]], [[DEF39]], [[DEF44]], implicit $exec + ; GCN-NEXT: %64.sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF32]], [[DEF33]], [[DEF44]], implicit $exec + ; GCN-NEXT: %64.sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF34]], [[DEF35]], [[DEF44]], implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF40]], %64, 1040, 0, implicit $exec :: (store (s128) into %ir.in1, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_8]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in20, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_9]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in21, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_10]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in22, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_11]], 
[[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in23, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_9]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_12]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_13]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_ADD_U32_e32_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF22]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF23]], implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_14]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in26, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_15]], [[DEF47]], 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_10]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF2]], 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF45]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF46]], implicit $exec + ; GCN-NEXT: 
[[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_16]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in26, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_10]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF3]], 2064, 0, implicit $exec :: (store (s128) into %ir.in3, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF45]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in27, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_ADD_U32_e32_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_ADD_U32_e32_18]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_19]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub0_sub1, [[DS_READ_B128_gfx9_11]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF4]], 2080, 0, implicit $exec :: (store (s128) into %ir.in4, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF46]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in28, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_ADD_U32_e32_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_ADD_U32_e32_20]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_21]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 
[[DS_READ_B128_gfx9_5]].sub2_sub3, [[DS_READ_B128_gfx9_11]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF5]], 16, 0, implicit $exec :: (store (s128) into %ir.in5, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_17]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in29, !alias.scope !0, addrspace 7) + ; GCN-NEXT: IGLP_OPT 1 + ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def dead $scc + ; GCN-NEXT: S_CMP_LG_U32 [[S_ADD_I32_]], 0, implicit-def $scc + ; GCN-NEXT: [[V_ADD_U32_e32_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_22]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_23]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_24]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_25]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_26]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_27]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_28]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_29]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_30]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_31]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_32]], implicit $exec + ; GCN-NEXT: 
[[V_ADD_U32_e32_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_33]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_34]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_35]], implicit $exec + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + %761:areg_512_align2 = IMPLICIT_DEF + %814:areg_512_align2 = IMPLICIT_DEF + %1757:av_128_align2 = IMPLICIT_DEF + %1755:av_128_align2 = IMPLICIT_DEF + %1754:av_128_align2 = IMPLICIT_DEF + %1756:av_128_align2 = IMPLICIT_DEF + %37:vgpr_32 = IMPLICIT_DEF + %38:vgpr_32 = IMPLICIT_DEF + %1736:vgpr_32 = IMPLICIT_DEF + %1737:vgpr_32 = IMPLICIT_DEF + %1738:vgpr_32 = IMPLICIT_DEF + %1739:vgpr_32 = IMPLICIT_DEF + %1740:vgpr_32 = IMPLICIT_DEF + %1741:vgpr_32 = IMPLICIT_DEF + %1742:vgpr_32 = IMPLICIT_DEF + %1743:vgpr_32 = IMPLICIT_DEF + %1744:vgpr_32 = IMPLICIT_DEF + %1745:vgpr_32 = IMPLICIT_DEF + %1746:vgpr_32 = IMPLICIT_DEF + %1747:vgpr_32 = IMPLICIT_DEF + %1748:vgpr_32 = IMPLICIT_DEF + %1749:vgpr_32 = IMPLICIT_DEF + %1750:vgpr_32 = IMPLICIT_DEF + %1751:vgpr_32 = IMPLICIT_DEF + %1766:vgpr_32 = IMPLICIT_DEF + %1767:vgpr_32 = IMPLICIT_DEF + %1768:vgpr_32 = IMPLICIT_DEF + %1769:vgpr_32 = IMPLICIT_DEF + %1770:vgpr_32 = IMPLICIT_DEF + %1771:vgpr_32 = IMPLICIT_DEF + %1772:vgpr_32 = IMPLICIT_DEF + %1773:vgpr_32 = IMPLICIT_DEF + %1758:vgpr_32 = IMPLICIT_DEF + %1759:vgpr_32 = IMPLICIT_DEF + %1760:vgpr_32 = IMPLICIT_DEF + %1761:vgpr_32 = IMPLICIT_DEF + %1762:vgpr_32 = IMPLICIT_DEF + %1763:vgpr_32 = IMPLICIT_DEF + %1764:vgpr_32 = IMPLICIT_DEF + %1765:vgpr_32 = IMPLICIT_DEF + %14:vgpr_32 = IMPLICIT_DEF + %31:vgpr_32 = IMPLICIT_DEF + %41:vgpr_32 = IMPLICIT_DEF + %43:sreg_32 = IMPLICIT_DEF + %535:sreg_32 = IMPLICIT_DEF + %1752:vgpr_32 = IMPLICIT_DEF + %1753:vgpr_32 = IMPLICIT_DEF + %450:sgpr_128 = IMPLICIT_DEF + %518:sgpr_128 = 
IMPLICIT_DEF + %1735:sreg_32 = IMPLICIT_DEF + + bb.1: + IGLP_OPT 1 + %683:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) + %688:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) + %693:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 2080, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) + %698:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 3120, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) + %703:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) + %708:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 2064, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3) + %713:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 4128, 0, implicit $exec :: (load (s128) from %ir.in6, !alias.scope !0, addrspace 3) + %718:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 6192, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 3) + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub0_sub1:av_128_align2, %703.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %762:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub2_sub3:av_128_align2, %703.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub0_sub1:av_128_align2, %708.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub2_sub3:av_128_align2, %708.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 
%693.sub0_sub1:av_128_align2, %713.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub2_sub3:av_128_align2, %713.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub0_sub1:av_128_align2, %718.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub2_sub3:av_128_align2, %718.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %764:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 1024, 0, implicit $exec :: (load (s128) from %ir.in8, !alias.scope !0, addrspace 3) + %769:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 3088, 0, implicit $exec :: (load (s128) from %ir.in9, !alias.scope !0, addrspace 3) + %774:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 5152, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 3) + %779:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 7216, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 3) + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub0_sub1:av_128_align2, %764.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub2_sub3:av_128_align2, %764.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub0_sub1:av_128_align2, %769.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub2_sub3:av_128_align2, %769.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + 
%814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub0_sub1:av_128_align2, %774.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub2_sub3:av_128_align2, %774.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub0_sub1:av_128_align2, %779.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub2_sub3:av_128_align2, %779.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + undef %1599.sub3:vreg_128_align2 = V_PERM_B32_e64 %1766:vgpr_32, %1767:vgpr_32, %535:sreg_32, implicit $exec + %1599.sub2:vreg_128_align2 = V_PERM_B32_e64 %1768:vgpr_32, %1769:vgpr_32, %535:sreg_32, implicit $exec + %1599.sub1:vreg_128_align2 = V_PERM_B32_e64 %1770:vgpr_32, %1771:vgpr_32, %535:sreg_32, implicit $exec + %1599.sub0:vreg_128_align2 = V_PERM_B32_e64 %1772:vgpr_32, %1773:vgpr_32, %535:sreg_32, implicit $exec + undef %1579.sub3:vreg_128_align2 = V_PERM_B32_e64 %1758:vgpr_32, %1759:vgpr_32, %535:sreg_32, implicit $exec + %1579.sub2:vreg_128_align2 = V_PERM_B32_e64 %1760:vgpr_32, %1761:vgpr_32, %535:sreg_32, implicit $exec + %1579.sub1:vreg_128_align2 = V_PERM_B32_e64 %1762:vgpr_32, %1763:vgpr_32, %535:sreg_32, implicit $exec + %1579.sub0:vreg_128_align2 = V_PERM_B32_e64 %1764:vgpr_32, %1765:vgpr_32, %535:sreg_32, implicit $exec + DS_WRITE_B128_gfx9 %14:vgpr_32, %1599:vreg_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in0, !alias.scope !0, addrspace 3) + DS_WRITE_B128_gfx9 %14:vgpr_32, %1579:vreg_128_align2, 1040, 0, implicit $exec :: (store (s128) into %ir.in1, !alias.scope !0, addrspace 3) + %830:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1736:vgpr_32, implicit $exec + %1773:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN 
%830:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in12, !alias.scope !0, addrspace 7) + %833:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1737:vgpr_32, implicit $exec + %1772:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %833:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in13, !alias.scope !0, addrspace 7) + %835:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1738:vgpr_32, implicit $exec + %1771:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %835:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in14, !alias.scope !0, addrspace 7) + %837:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1739:vgpr_32, implicit $exec + %1770:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %837:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in15, !alias.scope !0, addrspace 7) + %839:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1740:vgpr_32, implicit $exec + %1769:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %839:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in16, !alias.scope !0, addrspace 7) + %841:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1741:vgpr_32, implicit $exec + %1768:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %841:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in17, !alias.scope !0, addrspace 7) + %843:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1742:vgpr_32, implicit $exec + %1767:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %843:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in18, !alias.scope !0, addrspace 7) + %845:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1743:vgpr_32, implicit $exec + %1766:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %845:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in19, !alias.scope !0, addrspace 7) + %847:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1744:vgpr_32, implicit $exec + %1758:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %847:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in20, !alias.scope !0, addrspace 7) + %849:vgpr_32 = 
V_ADD_U32_e32 %41:vgpr_32, %1745:vgpr_32, implicit $exec + %1759:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %849:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in21, !alias.scope !0, addrspace 7) + %851:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1746:vgpr_32, implicit $exec + %1760:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %851:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in22, !alias.scope !0, addrspace 7) + %853:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1747:vgpr_32, implicit $exec + %1761:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %853:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in23, !alias.scope !0, addrspace 7) + %855:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1748:vgpr_32, implicit $exec + %1762:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %855:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7) + %857:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1749:vgpr_32, implicit $exec + %1763:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %857:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7) + %859:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1750:vgpr_32, implicit $exec + %1764:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %859:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in26, !alias.scope !0, addrspace 7) + %861:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1751:vgpr_32, implicit $exec + %1765:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %861:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %31:vgpr_32, %1757:av_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3) + DS_WRITE_B128_gfx9 %31:vgpr_32, %1755:av_128_align2, 2064, 0, implicit $exec :: (store (s128) into %ir.in3, !alias.scope !0, addrspace 3) + DS_WRITE_B128_gfx9 %31:vgpr_32, %1754:av_128_align2, 2080, 0, implicit $exec :: (store (s128) into %ir.in4, !alias.scope !0, addrspace 3) + DS_WRITE_B128_gfx9 
%31:vgpr_32, %1756:av_128_align2, 16, 0, implicit $exec :: (store (s128) into %ir.in5, !alias.scope !0, addrspace 3) + %864:vgpr_32 = V_ADD_U32_e32 -16, %1752:vgpr_32, implicit $exec + %1757:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %864:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in26, !alias.scope !0, addrspace 7) + %1755:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %1752:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in27, !alias.scope !0, addrspace 7) + %1754:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %1753:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in28, !alias.scope !0, addrspace 7) + %865:vgpr_32 = V_ADD_U32_e32 -16, %1753:vgpr_32, implicit $exec + %1756:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %865:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in29, !alias.scope !0, addrspace 7) + %1753:vgpr_32 = V_ADD_U32_e32 128, %1753:vgpr_32, implicit $exec + %1752:vgpr_32 = V_ADD_U32_e32 128, %1752:vgpr_32, implicit $exec + %1751:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1751:vgpr_32, implicit $exec + %1750:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1750:vgpr_32, implicit $exec + %1749:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1749:vgpr_32, implicit $exec + %1748:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1748:vgpr_32, implicit $exec + %1747:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1747:vgpr_32, implicit $exec + %1746:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1746:vgpr_32, implicit $exec + %1745:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1745:vgpr_32, implicit $exec + %1744:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1744:vgpr_32, implicit $exec + %1743:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1743:vgpr_32, implicit $exec + %1742:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1742:vgpr_32, implicit $exec + %1741:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1741:vgpr_32, implicit $exec + %1740:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1740:vgpr_32, implicit $exec + %1739:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, 
%1739:vgpr_32, implicit $exec + %1738:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1738:vgpr_32, implicit $exec + %1737:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1737:vgpr_32, implicit $exec + %1736:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1736:vgpr_32, implicit $exec + %1735:sreg_32 = nsw S_ADD_I32 %1735:sreg_32, -1, implicit-def dead $scc + S_CMP_LG_U32 %1735:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + + bb.2: + S_ENDPGM 0 +--- diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir @@ -0,0 +1,217 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @single-wave-phase-2c(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17) #0 { ret void } + + + !0 = distinct !{!0} + !1 = !{!1, !0} +... 
+ +--- +name: single-wave-phase-2c +tracksRegLiveness: true +machineFunctionInfo: + occupancy: 1 +body: | + ; GCN-LABEL: name: single-wave-phase-2c + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:av_512_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:av_512_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF7:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF16:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: dead [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF28:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF29:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF30:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: 
[[DEF31:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF32:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF33:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 0, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 2064, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF1]] + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF33]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF21]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_5:%[0-9]+]]:av_128_align2 = 
DS_READ_B128_gfx9 [[DEF3]], 3088, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF22]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF23]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF4]], [[DEF16]], 0, 0, implicit $exec :: (store (s128) into %ir.in6, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF6]], [[DEF7]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7) + ; GCN-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF]] + ; GCN-NEXT: undef [[DEF17]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF30]], implicit $exec + ; GCN-NEXT: [[DEF17]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF30]], implicit $exec + ; GCN-NEXT: [[DEF17]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF30]], implicit $exec + ; GCN-NEXT: [[DEF17]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF30]], implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF5]], [[DEF17]], 0, 0, implicit $exec :: (store (s128) into %ir.in8, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, 
[[DS_READ_B128_gfx9_4]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: undef [[DEF18]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF31]], implicit $exec + ; GCN-NEXT: [[DEF18]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF31]], implicit $exec + ; GCN-NEXT: [[DEF18]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF31]], implicit $exec + ; GCN-NEXT: [[DEF18]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF31]], implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF5]], [[DEF18]], 16, 0, implicit $exec :: (store (s128) into %ir.in9, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_4]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_1]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_2]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in12, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_3]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in13, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_5]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 
0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF24]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF25]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF26]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF27]], implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_4]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in14, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_5]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in15, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_6]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in16, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_7]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in17, !alias.scope !0, addrspace 7) + ; GCN-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_5]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: IGLP_OPT 1 + ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def dead $scc + ; GCN-NEXT: S_CMP_LG_U32 [[S_ADD_I32_]], 0, implicit-def $scc + ; GCN-NEXT: [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_8]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_9]], implicit $exec + ; GCN-NEXT: 
[[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_10]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_11]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 64, [[V_ADD_U32_e32_12]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_13]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_14]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_15]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_16]], implicit $exec + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + %1076:av_512_align2 = IMPLICIT_DEF + %1073:av_512_align2 = IMPLICIT_DEF + %25:vgpr_32 = IMPLICIT_DEF + %26:vgpr_32 = IMPLICIT_DEF + %13:vgpr_32 = IMPLICIT_DEF + %15:vgpr_32 = IMPLICIT_DEF + %1215:vgpr_32 = IMPLICIT_DEF + %381:sgpr_128 = IMPLICIT_DEF + %1225:vgpr_32 = IMPLICIT_DEF + %1224:vgpr_32 = IMPLICIT_DEF + %1226:vgpr_32 = IMPLICIT_DEF + %1227:vgpr_32 = IMPLICIT_DEF + %1228:vgpr_32 = IMPLICIT_DEF + %1229:vgpr_32 = IMPLICIT_DEF + %1230:vgpr_32 = IMPLICIT_DEF + %1231:vgpr_32 = IMPLICIT_DEF + %1232:av_128_align2 = IMPLICIT_DEF + %1091:vreg_128_align2 = IMPLICIT_DEF + %1067:vreg_128_align2 = IMPLICIT_DEF + %27:vgpr_32 = IMPLICIT_DEF + %1216:vgpr_32 = IMPLICIT_DEF + %1217:vgpr_32 = IMPLICIT_DEF + %1218:vgpr_32 = IMPLICIT_DEF + %1219:vgpr_32 = IMPLICIT_DEF + %1220:vgpr_32 = IMPLICIT_DEF + %1221:vgpr_32 = IMPLICIT_DEF + %1222:vgpr_32 = IMPLICIT_DEF + %1223:vgpr_32 = IMPLICIT_DEF + %29:sreg_32 = IMPLICIT_DEF + %1214:sreg_32 = IMPLICIT_DEF + %419:sreg_32 = IMPLICIT_DEF + %421:sreg_32 = IMPLICIT_DEF + %387:sgpr_128 = IMPLICIT_DEF + %1216:vgpr_32 = IMPLICIT_DEF + + bb.1: 
+ IGLP_OPT 1 + %489:av_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) + %494:av_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) + %499:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) + %504:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 2064, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) + %527:areg_512_align2 = COPY %1073:av_512_align2 + %527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub0_sub1:av_128_align2, %499.sub0_sub1:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub2_sub3:av_128_align2, %499.sub2_sub3:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub0_sub1:av_128_align2, %504.sub0_sub1:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub2_sub3:av_128_align2, %504.sub2_sub3:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %530:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) + %535:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 3088, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3) + %554:areg_512_align2 = COPY %1076:av_512_align2 + %554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub0_sub1:av_128_align2, %530.sub0_sub1:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub2_sub3:av_128_align2, %530.sub2_sub3:av_128_align2, 
%554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub0_sub1:av_128_align2, %535.sub0_sub1:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub2_sub3:av_128_align2, %535.sub2_sub3:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + DS_WRITE_B128_gfx9 %13:vgpr_32, %1232:av_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in6, !alias.scope !0, addrspace 3) + %1232:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %1215:vgpr_32, %381:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7) + %1091.sub0:vreg_128_align2 = V_PERM_B32_e64 %1225:vgpr_32, %1224:vgpr_32, %419:sreg_32, implicit $exec + %1067.sub0:vreg_128_align2 = V_PERM_B32_e64 %1225:vgpr_32, %1224:vgpr_32, %421:sreg_32, implicit $exec + %1091.sub1:vreg_128_align2 = V_PERM_B32_e64 %1227:vgpr_32, %1226:vgpr_32, %419:sreg_32, implicit $exec + %1067.sub1:vreg_128_align2 = V_PERM_B32_e64 %1227:vgpr_32, %1226:vgpr_32, %421:sreg_32, implicit $exec + %1091.sub2:vreg_128_align2 = V_PERM_B32_e64 %1229:vgpr_32, %1228:vgpr_32, %419:sreg_32, implicit $exec + %1067.sub2:vreg_128_align2 = V_PERM_B32_e64 %1229:vgpr_32, %1228:vgpr_32, %421:sreg_32, implicit $exec + %1091.sub3:vreg_128_align2 = V_PERM_B32_e64 %1231:vgpr_32, %1230:vgpr_32, %419:sreg_32, implicit $exec + %1067.sub3:vreg_128_align2 = V_PERM_B32_e64 %1231:vgpr_32, %1230:vgpr_32, %421:sreg_32, implicit $exec + DS_WRITE_B128_gfx9 %15:vgpr_32, %1091:vreg_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in8, !alias.scope !0, addrspace 3) + DS_WRITE_B128_gfx9 %15:vgpr_32, %1067:vreg_128_align2, 16, 0, implicit $exec :: (store (s128) into %ir.in9, !alias.scope !0, addrspace 3) + %572:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1216:vgpr_32, implicit $exec + %1224:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %572:vgpr_32, 
%387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 7) + %573:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1217:vgpr_32, implicit $exec + %1225:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %573:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 7) + %574:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1218:vgpr_32, implicit $exec + %1226:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %574:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in12, !alias.scope !0, addrspace 7) + %575:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1219:vgpr_32, implicit $exec + %1227:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %575:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in13, !alias.scope !0, addrspace 7) + %576:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1220:vgpr_32, implicit $exec + %1228:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %576:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in14, !alias.scope !0, addrspace 7) + %577:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1221:vgpr_32, implicit $exec + %1229:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %577:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in15, !alias.scope !0, addrspace 7) + %578:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1222:vgpr_32, implicit $exec + %1230:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %578:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in16, !alias.scope !0, addrspace 7) + %579:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1223:vgpr_32, implicit $exec + %1231:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %579:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in17, !alias.scope !0, addrspace 7) + %1223:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1223:vgpr_32, implicit $exec + %1222:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1222:vgpr_32, implicit $exec + %1221:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1221:vgpr_32, implicit $exec + %1220:vgpr_32 = V_ADD_U32_e32 
%29:sreg_32, %1220:vgpr_32, implicit $exec + %1219:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1219:vgpr_32, implicit $exec + %1218:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1218:vgpr_32, implicit $exec + %1217:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1217:vgpr_32, implicit $exec + %1216:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1216:vgpr_32, implicit $exec + %1215:vgpr_32 = V_ADD_U32_e32 64, %1215:vgpr_32, implicit $exec + %1214:sreg_32 = nsw S_ADD_I32 %1214:sreg_32, -1, implicit-def dead $scc + S_CMP_LG_U32 %1214:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + + bb.2: + S_ENDPGM 0 +---