diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -88,10 +88,10 @@ const SIInstrInfo *, SmallVectorImpl &, unsigned)> InstructionRuleType; - // Classify instructions into groups to enable fine tuned control over the - // scheduler. These groups may be more specific than current SchedModel - // instruction classes. - class SchedGroup { +// Classify instructions into groups to enable fine tuned control over the +// scheduler. These groups may be more specific than current SchedModel +// instruction classes. +class SchedGroup { private: // Mask that defines which instruction types can be classified into this // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER @@ -769,7 +769,10 @@ LLVM_DEBUG(DAG->dump()); } -enum IGLPStrategyID : int { MFMASmallGemmOptID = 0, DemoOptID = 1 }; +enum IGLPStrategyID : int { + MFMASmallGemmOptID = 0, + MFMASmallGemmSingleWaveOptID = 1, +}; // Implement a IGLP scheduling strategy. class IGLPStrategy { @@ -834,7 +837,7 @@ } } -class DemoOpt final : public IGLPStrategy { +class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { private: Direction OptDir = Direction::TOP_DOWN; @@ -847,60 +850,515 @@ Direction getDirection() override { return OptDir; } - DemoOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : IGLPStrategy(DAG, TII) {} }; -void DemoOpt::applyIGLPStrategy( +void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups) { - // Count the number of MFMA instructions. unsigned MFMACount = 0; - for (const MachineInstr &I : *DAG) - if (TII->isMFMAorWMMA(I)) + unsigned DSWCount = 0; + unsigned DSWWithPermCount = 0; + unsigned DSWWithSharedVMEMCount = 0; + unsigned DSRCount = 0; + SmallVector, 6> DSWithPerms; + for (auto &SU : DAG->SUnits) { + auto I = SU.getInstr(); + if (TII->isMFMAorWMMA(*I)) ++MFMACount; + else if (TII->isDS(*I)) { + if (I->mayLoad()) + ++DSRCount; + else if (I->mayStore()) { + ++DSWCount; + for (auto Pred : SU.Preds) { + if (Pred.getSUnit()->getInstr()->getOpcode() == + AMDGPU::V_PERM_B32_e64) { + DSWithPerms.push_back({&SU, false}); + break; + } + } + } + } + } + DSWWithPermCount = DSWithPerms.size(); + auto I = DSWithPerms.begin(); + auto E = DSWithPerms.end(); + + // Get the count of DS_WRITE couples that use the same VMEM_READ data. Both + // DS_WRITES will both have distinct V_PERM predecessors, which, in turn, will + // have a loop carried dependency (WAR) on the same VMEM_READ. If we find such + // a couple, mark them as counted as we continue along the loop so as to not + // double count. + for (; I != E; I++) { + if (I->second) + continue; + auto J = I + 1; + for (; J != E; J++) { + if (J->second) + continue; + auto FirstPred = std::find_if( + I->first->Preds.begin(), I->first->Preds.end(), [](const SDep &Pred) { + return Pred.getSUnit()->getInstr()->getOpcode() == + AMDGPU::V_PERM_B32_e64; + }); + + SDep *VMEMSucc = std::find_if( + FirstPred->getSUnit()->Succs.begin(), + FirstPred->getSUnit()->Succs.end(), [this](const SDep &VPermPred) { + auto MI = VPermPred.getSUnit()->getInstr(); + return TII->isVMEM(*MI) && MI->mayLoad(); + }); + if (VMEMSucc == FirstPred->getSUnit()->Succs.end()) + continue; - const unsigned PipelineSyncID = 0; - SchedGroup *SG = nullptr; + if (std::any_of(J->first->Preds.begin(), J->first->Preds.end(), + [&VMEMSucc](const SDep &DSWPred) { + auto MI = DSWPred.getSUnit()->getInstr(); + if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) + return false; + return std::any_of( + DSWPred.getSUnit()->Succs.begin(), + DSWPred.getSUnit()->Succs.end(), + [&VMEMSucc](const SDep &OtherVMEMSucc) { + return VMEMSucc->getSUnit() == + OtherVMEMSucc.getSUnit(); + }); + })) { + DSWWithSharedVMEMCount += 2; + I->second = true; + J->second = true; + break; + } + } + } + + SchedGroup *SG; + unsigned PipelineSyncID = 0; + // For kernels with V_PERM, there are enough VALU to mix in between MFMAs + if (DSWWithPermCount) { + for (unsigned I = 0; I < MFMACount; I++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 2, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + } - // The SU is a successor of SU in prev SchedGroup - InstructionRuleType Rule1 = + PipelineSyncID = 1; + // Phase 1: Break up DS_READ and MFMA clusters. + // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ + // prefetch + + // Whether the DS_READ is a predecessor of first four MFMA in region + InstructionRuleType EnablesInitialMFMA = + [](const SUnit *SU, ArrayRef Collection, const SIInstrInfo *TII, + SmallVectorImpl &SyncPipe, unsigned SGID) { + if (!SyncPipe.size()) + return false; + int MFMAsFound = 0; + for (auto &Elt : SyncPipe[0].DAG->SUnits) { + if (TII->isMFMAorWMMA(*Elt.getInstr())) { + if (std::any_of( + Elt.Preds.begin(), Elt.Preds.end(), + [&SU](const SDep &Pred) { return Pred.getSUnit() == SU; })) + return true; + + ++MFMAsFound; + } + if (MFMAsFound >= 4) + return false; + } + return false; + }; + + SmallVector DSRules; + DSRules.push_back(EnablesInitialMFMA); + + // Make ready initial MFMA + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 4, DSRules, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + // Interleave MFMA with DS_READ prefetch + for (unsigned I = 0; I < DSRCount - 4; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 1, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Phase 2a: Loop carried dependency with V_PERM + // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they + // depend on. Interleave MFMA to keep XDL unit busy throughout. + + // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE + InstructionRuleType IsPermForDSW = [](const SUnit *SU, ArrayRef Collection, const SIInstrInfo *TII, SmallVectorImpl &SyncPipe, unsigned SGID) { auto MI = SU->getInstr(); - if (MI->getOpcode() == TargetOpcode::BUNDLE) + if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) return false; + // Does the VALU have a DS_WRITE successor + if (!Collection.size()) { + return std::any_of( + SU->Succs.begin(), SU->Succs.end(), [&TII](const SDep &Succ) { + return (TII->isDS(*Succ.getSUnit()->getInstr()) && + Succ.getSUnit()->getInstr()->mayStore()); + }); + } + + // Does the VALU have a DS_WRITE successor that is the same as other + // VALU already in the group + return std::any_of( + Collection.begin(), Collection.end(), [&SU, &TII](SUnit *Elt) { + return std::any_of( + Elt->Succs.begin(), Elt->Succs.end(), + [&SU, &TII](const SDep &Succ) { + if (TII->isDS(*Succ.getSUnit()->getInstr()) && + Succ.getSUnit()->getInstr()->mayStore()) + return std::any_of(SU->Succs.begin(), SU->Succs.end(), + [&Succ](const SDep &ThisSucc) { + return ThisSucc.getSUnit() == + Succ.getSUnit(); + }); + return false; + }); + }); + }; + + // Whether the SU is a successor of any element in previous SchedGroup + InstructionRuleType IsSuccOfPrevGroup = + [](const SUnit *SU, ArrayRef Collection, const SIInstrInfo *TII, + SmallVectorImpl &SyncPipe, unsigned SGID) { SchedGroup *OtherGroup = nullptr; for (auto &PipeSG : SyncPipe) { - if (PipeSG.getSGID() == (int)SGID - 1) { + if ((unsigned)PipeSG.getSGID() == SGID - 1) { OtherGroup = &PipeSG; } } if (!OtherGroup) return false; + if (!OtherGroup->Collection.size()) + return true; + // Does the previous VALU have this DS_Write as a successor return (std::any_of(OtherGroup->Collection.begin(), OtherGroup->Collection.end(), [&SU](SUnit *Elt) { - return std::any_of( - Elt->Succs.begin(), Elt->Succs.end(), - [&SU](SDep &Succ) { return Succ.getSUnit() == SU; }); + return std::any_of(Elt->Succs.begin(), + Elt->Succs.end(), + [&SU](SDep &Succ) { + return Succ.getSUnit() == SU; + }); })); - }; + }; + + // Whether the combined load width of group is 128 bits + InstructionRuleType VMEMSize = + [](const SUnit *SU, ArrayRef Collection, const SIInstrInfo *TII, + SmallVectorImpl &SyncPipe, unsigned SGID) { + auto MI = SU->getInstr(); + if (MI->getOpcode() == TargetOpcode::BUNDLE) + return false; + if (!Collection.size()) + return true; + + int NumBits = 0; + + auto TRI = TII->getRegisterInfo(); + auto &MRI = MI->getParent()->getParent()->getRegInfo(); + for (auto &Elt : Collection) { + auto Op = Elt->getInstr()->getOperand(0); + auto Size = + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); + NumBits += Size; + } + + if (NumBits < 128) { + assert(TII->isVMEM(*MI) && MI->mayLoad()); + if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( + MRI, MI->getOperand(0))) <= + 128) + return true; + } + + return false; + }; + + // Whether the SU shares a V_PERM predecessor with any SU in the previous + // SchedGroup + InstructionRuleType SharesPredWithPrevGroup = + [](const SUnit *SU, ArrayRef Collection, const SIInstrInfo *TII, + SmallVectorImpl &SyncPipe, unsigned SGID) { + SchedGroup *OtherGroup = nullptr; + if (!SyncPipe.size()) + return false; + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - 1) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + auto DAG = SyncPipe[0].DAG; + + // Does the previous DS_WRITE share a V_PERM predecessor with this + // VMEM_READ + return (std::any_of( + OtherGroup->Collection.begin(), OtherGroup->Collection.end(), + [&SU, &DAG](SUnit *Elt) { + return std::any_of( + Elt->Preds.begin(), Elt->Preds.end(), + [&SU, &DAG](SDep &Pred) { + return Pred.getSUnit()->getInstr()->getOpcode() == + AMDGPU::V_PERM_B32_e64 && + DAG->IsReachable(const_cast(SU), + Pred.getSUnit()); + }); + })); + }; + + // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 3 + // steps back in the pipeline + InstructionRuleType SharesPredWithThirdPrevGroup = + [](const SUnit *SU, ArrayRef Collection, const SIInstrInfo *TII, + SmallVectorImpl &SyncPipe, unsigned SGID) { + auto MI = SU->getInstr(); + if (MI->getOpcode() == TargetOpcode::BUNDLE) + return false; - SmallVector DemoRules; - DemoRules.push_back(Rule1); + SchedGroup *OtherGroup = nullptr; + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - 3) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + auto DAG = SyncPipe[0].DAG; + + // Does the previous DS_WRITE share a V_PERM predecessor with this + // VMEM_READ + return (std::any_of( + OtherGroup->Collection.begin(), OtherGroup->Collection.end(), + [&SU, &DAG](SUnit *Elt) { + return std::any_of( + Elt->Preds.begin(), Elt->Preds.end(), + [&SU, &DAG](SDep &Pred) { + return Pred.getSUnit()->getInstr()->getOpcode() == + AMDGPU::V_PERM_B32_e64 && + DAG->IsReachable(const_cast(SU), + Pred.getSUnit()); + }); + })); + }; + + SmallVector VALURules; + VALURules.push_back(IsPermForDSW); + + SmallVector DSWRules; + DSWRules.push_back(IsSuccOfPrevGroup); + + SmallVector VMEMRules; + VMEMRules.push_back(SharesPredWithPrevGroup); + VMEMRules.push_back(VMEMSize); + + SmallVector LaterVMEMRules; + LaterVMEMRules.push_back(SharesPredWithThirdPrevGroup); + LaterVMEMRules.push_back(VMEMSize); + + for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 4, VALURules, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, DSWRules, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, VMEMRules, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); - // Each iteration of pipeline has 1 MFMA and 1 DS_W, where the DS_W is a - // successor of the MFMA - for (unsigned I = 0; I < MFMACount; ++I) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( - SchedGroupMask::DS_WRITE, 1, DemoRules, PipelineSyncID, DAG, TII); + SchedGroupMask::VMEM_READ, 4, LaterVMEMRules, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Phase 2b: Loop carried dependency without V_PERM + // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. + // Interleave MFMA to keep XDL unit busy throughout. + SmallVector VMEMNOPermRules; + VMEMNOPermRules.push_back(VMEMSize); + + for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, VMEMNOPermRules, PipelineSyncID, DAG, + TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are + // ultimately used by two DS_WRITE + // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they + // depend on. Interleave MFMA to keep XDL unit busy throughout. + + // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 2 + // steps back in the pipeline + InstructionRuleType SharesPredWithSecondPrevGroup = + [](const SUnit *SU, ArrayRef Collection, const SIInstrInfo *TII, + SmallVectorImpl &SyncPipe, unsigned SGID) { + SchedGroup *OtherGroup = nullptr; + if (!SyncPipe.size()) + return false; + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - 2) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + auto DAG = SyncPipe[0].DAG; + + // Does the previous DS_WRITE share a V_PERM predecessor with this + // VMEM_READ + return (std::any_of( + OtherGroup->Collection.begin(), OtherGroup->Collection.end(), + [&SU, &DAG](SUnit *Elt) { + return std::any_of( + Elt->Preds.begin(), Elt->Preds.end(), + [&SU, &DAG](SDep &Pred) { + return Pred.getSUnit()->getInstr()->getOpcode() == + AMDGPU::V_PERM_B32_e64 && + DAG->IsReachable(const_cast(SU), + Pred.getSUnit()); + }); + })); + }; + + // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 4 + // steps back in the pipeline + InstructionRuleType SharesPredWithFourthPrevGroup = + [](const SUnit *SU, ArrayRef Collection, const SIInstrInfo *TII, + SmallVectorImpl &SyncPipe, unsigned SGID) { + SchedGroup *OtherGroup = nullptr; + if (!SyncPipe.size()) + return false; + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - 4) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + auto DAG = SyncPipe[0].DAG; + + // Does the previous DS_WRITE share a V_PERM predecessor with this + // VMEM_READ + return (std::any_of( + OtherGroup->Collection.begin(), OtherGroup->Collection.end(), + [&SU, &DAG](SUnit *Elt) { + return std::any_of( + Elt->Preds.begin(), Elt->Preds.end(), + [&SU, &DAG](SDep &Pred) { + return Pred.getSUnit()->getInstr()->getOpcode() == + AMDGPU::V_PERM_B32_e64 && + DAG->IsReachable(const_cast(SU), + Pred.getSUnit()); + }); + })); + }; + + SmallVector VMEMRules2c; + VMEMRules2c.push_back(SharesPredWithSecondPrevGroup); + VMEMRules2c.push_back(VMEMSize); + + SmallVector LaterVMEMRules2c; + LaterVMEMRules2c.push_back(SharesPredWithFourthPrevGroup); + LaterVMEMRules2c.push_back(VMEMSize); + + for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 4, VALURules, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, DSWRules, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 4, VALURules, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, DSWRules, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, VMEMRules2c, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, LaterVMEMRules2c, PipelineSyncID, DAG, + TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, std::nullopt, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); } } @@ -911,8 +1369,8 @@ switch (ID) { case MFMASmallGemmOptID: return std::make_unique(DAG, TII); - case DemoOptID: - return std::make_unique(DAG, TII); + case MFMASmallGemmSingleWaveOptID: + return std::make_unique(DAG, TII); } llvm_unreachable("Unknown IGLPStrategyID"); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -168,27 +168,49 @@ ; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192 +; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; GCN-NEXT: s_waitcnt lgkmcnt(7) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] +; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152 +; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 ; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 @@ -197,58 +219,30 @@ ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152 -; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v1 -; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] -; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456 -; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440 -; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424 -; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408 -; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344 -; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360 -; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376 -; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392 -; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672 -; GCN-NEXT: s_waitcnt lgkmcnt(14) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592 ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864 ; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880 ; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir @@ -0,0 +1,339 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @single-wave-phase-2b(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17, i32 addrspace(7)* noalias %in18, i32 addrspace(7)* noalias %in19, i32 addrspace(7)* noalias %in20, i32 addrspace(7)* noalias %in21, i32 addrspace(7)* noalias %in22, i32 addrspace(7)* noalias %in23, i32 addrspace(7)* noalias %in24, i32 addrspace(7)* noalias %in25, i32 addrspace(7)* noalias %in26, i32 addrspace(7)* noalias %in27, i32 addrspace(7)* noalias %in28, i32 addrspace(7)* noalias %in29) #0 { ret void } + + !0 = distinct !{!0} + !1 = !{!1, !0} +... + + +--- +name: single-wave-phase-2b +tracksRegLiveness: true +machineFunctionInfo: + occupancy: 1 +body: | + ; GCN-LABEL: name: single-wave-phase-2b + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:areg_512_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:areg_512_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF3:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF4:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF5:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF32:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF33:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF34:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF35:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF38:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF39:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF40:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF41:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF42:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF43:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF44:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF45:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF46:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF47:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF48:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF49:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 0, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 2064, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 2080, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF8]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF9]], implicit $exec + ; GCN-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_5:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 3120, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF10]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF11]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_6:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 4128, 0, implicit $exec :: (load (s128) from %ir.in6, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF12]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF13]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_7:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 6192, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF14]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF15]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_6]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_8:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 1024, 0, implicit $exec :: (load (s128) from %ir.in8, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF16]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF17]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_6]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_9:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 3088, 0, implicit $exec :: (load (s128) from %ir.in9, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF18]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF19]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub0_sub1, [[DS_READ_B128_gfx9_7]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_10:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 5152, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF20]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF21]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub2_sub3, [[DS_READ_B128_gfx9_7]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_11:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 7216, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_8]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: undef %63.sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF28]], [[DEF29]], [[DEF44]], implicit $exec + ; GCN-NEXT: %63.sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF30]], [[DEF31]], [[DEF44]], implicit $exec + ; GCN-NEXT: %63.sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF24]], [[DEF25]], [[DEF44]], implicit $exec + ; GCN-NEXT: %63.sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF26]], [[DEF27]], [[DEF44]], implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF40]], %63, 0, 0, implicit $exec :: (store (s128) into %ir.in0, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in12, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_1]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in13, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_2]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in14, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_3]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in15, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_8]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_4]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in16, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_5]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in17, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_6]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in18, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_7]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in19, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_9]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: undef %64.sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF36]], [[DEF37]], [[DEF44]], implicit $exec + ; GCN-NEXT: %64.sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF38]], [[DEF39]], [[DEF44]], implicit $exec + ; GCN-NEXT: %64.sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF32]], [[DEF33]], [[DEF44]], implicit $exec + ; GCN-NEXT: %64.sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF34]], [[DEF35]], [[DEF44]], implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF40]], %64, 1040, 0, implicit $exec :: (store (s128) into %ir.in1, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_8]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in20, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_9]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in21, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_10]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in22, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_11]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in23, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_9]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_12]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_13]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_ADD_U32_e32_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF22]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF23]], implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_14]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in26, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_15]], [[DEF47]], 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_10]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF2]], 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF45]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF46]], implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_16]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in26, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_10]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF3]], 2064, 0, implicit $exec :: (store (s128) into %ir.in3, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF45]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in27, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_ADD_U32_e32_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_ADD_U32_e32_18]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_19]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub0_sub1, [[DS_READ_B128_gfx9_11]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF4]], 2080, 0, implicit $exec :: (store (s128) into %ir.in4, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF46]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in28, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_ADD_U32_e32_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_ADD_U32_e32_20]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_21]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub2_sub3, [[DS_READ_B128_gfx9_11]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF5]], 16, 0, implicit $exec :: (store (s128) into %ir.in5, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_17]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in29, !alias.scope !0, addrspace 7) + ; GCN-NEXT: IGLP_OPT 1 + ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def dead $scc + ; GCN-NEXT: S_CMP_LG_U32 [[S_ADD_I32_]], 0, implicit-def $scc + ; GCN-NEXT: [[V_ADD_U32_e32_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_22]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_23]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_24]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_25]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_26]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_27]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_28]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_29]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_30]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_31]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_32]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_33]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_34]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_35]], implicit $exec + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + %761:areg_512_align2 = IMPLICIT_DEF + %814:areg_512_align2 = IMPLICIT_DEF + %1757:av_128_align2 = IMPLICIT_DEF + %1755:av_128_align2 = IMPLICIT_DEF + %1754:av_128_align2 = IMPLICIT_DEF + %1756:av_128_align2 = IMPLICIT_DEF + %37:vgpr_32 = IMPLICIT_DEF + %38:vgpr_32 = IMPLICIT_DEF + %1736:vgpr_32 = IMPLICIT_DEF + %1737:vgpr_32 = IMPLICIT_DEF + %1738:vgpr_32 = IMPLICIT_DEF + %1739:vgpr_32 = IMPLICIT_DEF + %1740:vgpr_32 = IMPLICIT_DEF + %1741:vgpr_32 = IMPLICIT_DEF + %1742:vgpr_32 = IMPLICIT_DEF + %1743:vgpr_32 = IMPLICIT_DEF + %1744:vgpr_32 = IMPLICIT_DEF + %1745:vgpr_32 = IMPLICIT_DEF + %1746:vgpr_32 = IMPLICIT_DEF + %1747:vgpr_32 = IMPLICIT_DEF + %1748:vgpr_32 = IMPLICIT_DEF + %1749:vgpr_32 = IMPLICIT_DEF + %1750:vgpr_32 = IMPLICIT_DEF + %1751:vgpr_32 = IMPLICIT_DEF + %1766:vgpr_32 = IMPLICIT_DEF + %1767:vgpr_32 = IMPLICIT_DEF + %1768:vgpr_32 = IMPLICIT_DEF + %1769:vgpr_32 = IMPLICIT_DEF + %1770:vgpr_32 = IMPLICIT_DEF + %1771:vgpr_32 = IMPLICIT_DEF + %1772:vgpr_32 = IMPLICIT_DEF + %1773:vgpr_32 = IMPLICIT_DEF + %1758:vgpr_32 = IMPLICIT_DEF + %1759:vgpr_32 = IMPLICIT_DEF + %1760:vgpr_32 = IMPLICIT_DEF + %1761:vgpr_32 = IMPLICIT_DEF + %1762:vgpr_32 = IMPLICIT_DEF + %1763:vgpr_32 = IMPLICIT_DEF + %1764:vgpr_32 = IMPLICIT_DEF + %1765:vgpr_32 = IMPLICIT_DEF + %14:vgpr_32 = IMPLICIT_DEF + %31:vgpr_32 = IMPLICIT_DEF + %41:vgpr_32 = IMPLICIT_DEF + %43:sreg_32 = IMPLICIT_DEF + %535:sreg_32 = IMPLICIT_DEF + %1752:vgpr_32 = IMPLICIT_DEF + %1753:vgpr_32 = IMPLICIT_DEF + %450:sgpr_128 = IMPLICIT_DEF + %518:sgpr_128 = IMPLICIT_DEF + %1735:sreg_32 = IMPLICIT_DEF + + bb.1: + IGLP_OPT 1 + %683:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) + %688:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) + %693:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 2080, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) + %698:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 3120, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) + %703:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) + %708:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 2064, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3) + %713:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 4128, 0, implicit $exec :: (load (s128) from %ir.in6, !alias.scope !0, addrspace 3) + %718:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 6192, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 3) + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub0_sub1:av_128_align2, %703.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %762:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub2_sub3:av_128_align2, %703.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub0_sub1:av_128_align2, %708.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub2_sub3:av_128_align2, %708.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub0_sub1:av_128_align2, %713.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub2_sub3:av_128_align2, %713.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub0_sub1:av_128_align2, %718.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub2_sub3:av_128_align2, %718.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %764:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 1024, 0, implicit $exec :: (load (s128) from %ir.in8, !alias.scope !0, addrspace 3) + %769:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 3088, 0, implicit $exec :: (load (s128) from %ir.in9, !alias.scope !0, addrspace 3) + %774:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 5152, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 3) + %779:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 7216, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 3) + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub0_sub1:av_128_align2, %764.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub2_sub3:av_128_align2, %764.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub0_sub1:av_128_align2, %769.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub2_sub3:av_128_align2, %769.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub0_sub1:av_128_align2, %774.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub2_sub3:av_128_align2, %774.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub0_sub1:av_128_align2, %779.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub2_sub3:av_128_align2, %779.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + undef %1599.sub3:vreg_128_align2 = V_PERM_B32_e64 %1766:vgpr_32, %1767:vgpr_32, %535:sreg_32, implicit $exec + %1599.sub2:vreg_128_align2 = V_PERM_B32_e64 %1768:vgpr_32, %1769:vgpr_32, %535:sreg_32, implicit $exec + %1599.sub1:vreg_128_align2 = V_PERM_B32_e64 %1770:vgpr_32, %1771:vgpr_32, %535:sreg_32, implicit $exec + %1599.sub0:vreg_128_align2 = V_PERM_B32_e64 %1772:vgpr_32, %1773:vgpr_32, %535:sreg_32, implicit $exec + undef %1579.sub3:vreg_128_align2 = V_PERM_B32_e64 %1758:vgpr_32, %1759:vgpr_32, %535:sreg_32, implicit $exec + %1579.sub2:vreg_128_align2 = V_PERM_B32_e64 %1760:vgpr_32, %1761:vgpr_32, %535:sreg_32, implicit $exec + %1579.sub1:vreg_128_align2 = V_PERM_B32_e64 %1762:vgpr_32, %1763:vgpr_32, %535:sreg_32, implicit $exec + %1579.sub0:vreg_128_align2 = V_PERM_B32_e64 %1764:vgpr_32, %1765:vgpr_32, %535:sreg_32, implicit $exec + DS_WRITE_B128_gfx9 %14:vgpr_32, %1599:vreg_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in0, !alias.scope !0, addrspace 3) + DS_WRITE_B128_gfx9 %14:vgpr_32, %1579:vreg_128_align2, 1040, 0, implicit $exec :: (store (s128) into %ir.in1, !alias.scope !0, addrspace 3) + %830:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1736:vgpr_32, implicit $exec + %1773:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %830:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in12, !alias.scope !0, addrspace 7) + %833:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1737:vgpr_32, implicit $exec + %1772:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %833:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in13, !alias.scope !0, addrspace 7) + %835:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1738:vgpr_32, implicit $exec + %1771:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %835:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in14, !alias.scope !0, addrspace 7) + %837:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1739:vgpr_32, implicit $exec + %1770:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %837:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in15, !alias.scope !0, addrspace 7) + %839:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1740:vgpr_32, implicit $exec + %1769:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %839:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in16, !alias.scope !0, addrspace 7) + %841:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1741:vgpr_32, implicit $exec + %1768:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %841:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in17, !alias.scope !0, addrspace 7) + %843:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1742:vgpr_32, implicit $exec + %1767:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %843:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in18, !alias.scope !0, addrspace 7) + %845:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1743:vgpr_32, implicit $exec + %1766:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %845:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in19, !alias.scope !0, addrspace 7) + %847:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1744:vgpr_32, implicit $exec + %1758:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %847:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in20, !alias.scope !0, addrspace 7) + %849:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1745:vgpr_32, implicit $exec + %1759:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %849:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in21, !alias.scope !0, addrspace 7) + %851:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1746:vgpr_32, implicit $exec + %1760:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %851:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in22, !alias.scope !0, addrspace 7) + %853:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1747:vgpr_32, implicit $exec + %1761:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %853:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in23, !alias.scope !0, addrspace 7) + %855:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1748:vgpr_32, implicit $exec + %1762:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %855:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7) + %857:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1749:vgpr_32, implicit $exec + %1763:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %857:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7) + %859:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1750:vgpr_32, implicit $exec + %1764:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %859:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in26, !alias.scope !0, addrspace 7) + %861:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1751:vgpr_32, implicit $exec + %1765:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %861:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %31:vgpr_32, %1757:av_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3) + DS_WRITE_B128_gfx9 %31:vgpr_32, %1755:av_128_align2, 2064, 0, implicit $exec :: (store (s128) into %ir.in3, !alias.scope !0, addrspace 3) + DS_WRITE_B128_gfx9 %31:vgpr_32, %1754:av_128_align2, 2080, 0, implicit $exec :: (store (s128) into %ir.in4, !alias.scope !0, addrspace 3) + DS_WRITE_B128_gfx9 %31:vgpr_32, %1756:av_128_align2, 16, 0, implicit $exec :: (store (s128) into %ir.in5, !alias.scope !0, addrspace 3) + %864:vgpr_32 = V_ADD_U32_e32 -16, %1752:vgpr_32, implicit $exec + %1757:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %864:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in26, !alias.scope !0, addrspace 7) + %1755:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %1752:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in27, !alias.scope !0, addrspace 7) + %1754:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %1753:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in28, !alias.scope !0, addrspace 7) + %865:vgpr_32 = V_ADD_U32_e32 -16, %1753:vgpr_32, implicit $exec + %1756:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %865:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in29, !alias.scope !0, addrspace 7) + %1753:vgpr_32 = V_ADD_U32_e32 128, %1753:vgpr_32, implicit $exec + %1752:vgpr_32 = V_ADD_U32_e32 128, %1752:vgpr_32, implicit $exec + %1751:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1751:vgpr_32, implicit $exec + %1750:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1750:vgpr_32, implicit $exec + %1749:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1749:vgpr_32, implicit $exec + %1748:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1748:vgpr_32, implicit $exec + %1747:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1747:vgpr_32, implicit $exec + %1746:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1746:vgpr_32, implicit $exec + %1745:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1745:vgpr_32, implicit $exec + %1744:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1744:vgpr_32, implicit $exec + %1743:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1743:vgpr_32, implicit $exec + %1742:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1742:vgpr_32, implicit $exec + %1741:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1741:vgpr_32, implicit $exec + %1740:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1740:vgpr_32, implicit $exec + %1739:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1739:vgpr_32, implicit $exec + %1738:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1738:vgpr_32, implicit $exec + %1737:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1737:vgpr_32, implicit $exec + %1736:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1736:vgpr_32, implicit $exec + %1735:sreg_32 = nsw S_ADD_I32 %1735:sreg_32, -1, implicit-def dead $scc + S_CMP_LG_U32 %1735:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + + bb.2: + S_ENDPGM 0 +--- diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir @@ -0,0 +1,217 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @single-wave-phase-2c(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17) #0 { ret void } + + + !0 = distinct !{!0} + !1 = !{!1, !0} +... + +--- +name: single-wave-phase-2c +tracksRegLiveness: true +machineFunctionInfo: + occupancy: 1 +body: | + ; GCN-LABEL: name: single-wave-phase-2c + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:av_512_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:av_512_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF7:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF16:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: dead [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF28:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF29:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF30:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF31:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF32:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF33:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 0, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 2064, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[PRED_COPY:%[0-9]+]]:areg_512_align2 = PRED_COPY [[DEF1]] + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF33]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF21]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_5:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 3088, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF22]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF23]], implicit $exec + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF4]], [[DEF16]], 0, 0, implicit $exec :: (store (s128) into %ir.in6, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF6]], [[DEF7]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7) + ; GCN-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[PRED_COPY:%[0-9]+]]:areg_512_align2 = PRED_COPY [[DEF]] + ; GCN-NEXT: undef [[DEF17]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF30]], implicit $exec + ; GCN-NEXT: [[DEF17]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF30]], implicit $exec + ; GCN-NEXT: [[DEF17]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF30]], implicit $exec + ; GCN-NEXT: [[DEF17]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF30]], implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF5]], [[DEF17]], 0, 0, implicit $exec :: (store (s128) into %ir.in8, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_4]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: undef [[DEF18]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF31]], implicit $exec + ; GCN-NEXT: [[DEF18]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF31]], implicit $exec + ; GCN-NEXT: [[DEF18]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF31]], implicit $exec + ; GCN-NEXT: [[DEF18]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF31]], implicit $exec + ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF5]], [[DEF18]], 16, 0, implicit $exec :: (store (s128) into %ir.in9, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_4]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_1]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_2]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in12, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_3]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in13, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_5]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF24]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF25]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF26]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF27]], implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_4]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in14, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_5]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in15, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_6]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in16, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_7]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in17, !alias.scope !0, addrspace 7) + ; GCN-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_5]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: IGLP_OPT 1 + ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def dead $scc + ; GCN-NEXT: S_CMP_LG_U32 [[S_ADD_I32_]], 0, implicit-def $scc + ; GCN-NEXT: [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_8]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_9]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_10]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_11]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 64, [[V_ADD_U32_e32_12]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_13]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_14]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_15]], implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e32_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_16]], implicit $exec + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + %1076:av_512_align2 = IMPLICIT_DEF + %1073:av_512_align2 = IMPLICIT_DEF + %25:vgpr_32 = IMPLICIT_DEF + %26:vgpr_32 = IMPLICIT_DEF + %13:vgpr_32 = IMPLICIT_DEF + %15:vgpr_32 = IMPLICIT_DEF + %1215:vgpr_32 = IMPLICIT_DEF + %381:sgpr_128 = IMPLICIT_DEF + %1225:vgpr_32 = IMPLICIT_DEF + %1224:vgpr_32 = IMPLICIT_DEF + %1226:vgpr_32 = IMPLICIT_DEF + %1227:vgpr_32 = IMPLICIT_DEF + %1228:vgpr_32 = IMPLICIT_DEF + %1229:vgpr_32 = IMPLICIT_DEF + %1230:vgpr_32 = IMPLICIT_DEF + %1231:vgpr_32 = IMPLICIT_DEF + %1232:av_128_align2 = IMPLICIT_DEF + %1091:vreg_128_align2 = IMPLICIT_DEF + %1067:vreg_128_align2 = IMPLICIT_DEF + %27:vgpr_32 = IMPLICIT_DEF + %1216:vgpr_32 = IMPLICIT_DEF + %1217:vgpr_32 = IMPLICIT_DEF + %1218:vgpr_32 = IMPLICIT_DEF + %1219:vgpr_32 = IMPLICIT_DEF + %1220:vgpr_32 = IMPLICIT_DEF + %1221:vgpr_32 = IMPLICIT_DEF + %1222:vgpr_32 = IMPLICIT_DEF + %1223:vgpr_32 = IMPLICIT_DEF + %29:sreg_32 = IMPLICIT_DEF + %1214:sreg_32 = IMPLICIT_DEF + %419:sreg_32 = IMPLICIT_DEF + %421:sreg_32 = IMPLICIT_DEF + %387:sgpr_128 = IMPLICIT_DEF + %1216:vgpr_32 = IMPLICIT_DEF + + bb.1: + IGLP_OPT 1 + %489:av_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) + %494:av_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) + %499:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) + %504:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 2064, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) + %527:areg_512_align2 = PRED_COPY %1073:av_512_align2 + %527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub0_sub1:av_128_align2, %499.sub0_sub1:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub2_sub3:av_128_align2, %499.sub2_sub3:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub0_sub1:av_128_align2, %504.sub0_sub1:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub2_sub3:av_128_align2, %504.sub2_sub3:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %530:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) + %535:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 3088, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3) + %554:areg_512_align2 = PRED_COPY %1076:av_512_align2 + %554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub0_sub1:av_128_align2, %530.sub0_sub1:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub2_sub3:av_128_align2, %530.sub2_sub3:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub0_sub1:av_128_align2, %535.sub0_sub1:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub2_sub3:av_128_align2, %535.sub2_sub3:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + DS_WRITE_B128_gfx9 %13:vgpr_32, %1232:av_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in6, !alias.scope !0, addrspace 3) + %1232:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %1215:vgpr_32, %381:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7) + %1091.sub0:vreg_128_align2 = V_PERM_B32_e64 %1225:vgpr_32, %1224:vgpr_32, %419:sreg_32, implicit $exec + %1067.sub0:vreg_128_align2 = V_PERM_B32_e64 %1225:vgpr_32, %1224:vgpr_32, %421:sreg_32, implicit $exec + %1091.sub1:vreg_128_align2 = V_PERM_B32_e64 %1227:vgpr_32, %1226:vgpr_32, %419:sreg_32, implicit $exec + %1067.sub1:vreg_128_align2 = V_PERM_B32_e64 %1227:vgpr_32, %1226:vgpr_32, %421:sreg_32, implicit $exec + %1091.sub2:vreg_128_align2 = V_PERM_B32_e64 %1229:vgpr_32, %1228:vgpr_32, %419:sreg_32, implicit $exec + %1067.sub2:vreg_128_align2 = V_PERM_B32_e64 %1229:vgpr_32, %1228:vgpr_32, %421:sreg_32, implicit $exec + %1091.sub3:vreg_128_align2 = V_PERM_B32_e64 %1231:vgpr_32, %1230:vgpr_32, %419:sreg_32, implicit $exec + %1067.sub3:vreg_128_align2 = V_PERM_B32_e64 %1231:vgpr_32, %1230:vgpr_32, %421:sreg_32, implicit $exec + DS_WRITE_B128_gfx9 %15:vgpr_32, %1091:vreg_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in8, !alias.scope !0, addrspace 3) + DS_WRITE_B128_gfx9 %15:vgpr_32, %1067:vreg_128_align2, 16, 0, implicit $exec :: (store (s128) into %ir.in9, !alias.scope !0, addrspace 3) + %572:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1216:vgpr_32, implicit $exec + %1224:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %572:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 7) + %573:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1217:vgpr_32, implicit $exec + %1225:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %573:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 7) + %574:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1218:vgpr_32, implicit $exec + %1226:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %574:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in12, !alias.scope !0, addrspace 7) + %575:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1219:vgpr_32, implicit $exec + %1227:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %575:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in13, !alias.scope !0, addrspace 7) + %576:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1220:vgpr_32, implicit $exec + %1228:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %576:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in14, !alias.scope !0, addrspace 7) + %577:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1221:vgpr_32, implicit $exec + %1229:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %577:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in15, !alias.scope !0, addrspace 7) + %578:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1222:vgpr_32, implicit $exec + %1230:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %578:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in16, !alias.scope !0, addrspace 7) + %579:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1223:vgpr_32, implicit $exec + %1231:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %579:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in17, !alias.scope !0, addrspace 7) + %1223:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1223:vgpr_32, implicit $exec + %1222:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1222:vgpr_32, implicit $exec + %1221:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1221:vgpr_32, implicit $exec + %1220:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1220:vgpr_32, implicit $exec + %1219:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1219:vgpr_32, implicit $exec + %1218:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1218:vgpr_32, implicit $exec + %1217:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1217:vgpr_32, implicit $exec + %1216:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1216:vgpr_32, implicit $exec + %1215:vgpr_32 = V_ADD_U32_e32 64, %1215:vgpr_32, implicit $exec + %1214:sreg_32 = nsw S_ADD_I32 %1214:sreg_32, -1, implicit-def dead $scc + S_CMP_LG_U32 %1214:sreg_32, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + + bb.2: + S_ENDPGM 0 +---