This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][IGLP] Add iglp_opt(1) strategy for single wave gemms
ClosedPublic

Authored by jrbyrnes on May 3 2023, 10:19 AM.

Details

Summary

This adds the IGLP strategy for single-wave GEMMs. The SchedGroup pipeline is laid out in multiple phases, each corresponding to a distinct pattern present in GEMM kernels. The resilience of the optimization depends on the IR (as seen by pre-RA scheduling) continuing to exhibit these patterns (as defined by instruction class and dependencies) in their current relative ordering.

The kernels of interest have these specific phases:
NT: 1, 2a, 2c
NN: 1, 2a, 2b
TT: 1, 2b, 2c
TN: 1, 2b

The general approach taken was to use a long SchedGroup pipeline; this gives the scheduler less latitude to make poor decisions. To resolve the challenge of correctly fitting these long pipelines, we leverage the rules infrastructure to help the solver.
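As a rough sketch of the idea (a simplified model with hypothetical names, not the actual code in AMDGPUIGroupLP.cpp), a rules-assisted SchedGroup pipeline can be thought of as an ordered list of groups, each with an instruction-class mask, a capacity, and extra predicates ("rules") that a candidate instruction must satisfy before it can be assigned to the group:

```cpp
#include <cassert>
#include <functional>
#include <vector>

// Simplified stand-ins for the real instruction categories (assumption:
// the actual pass matches SUnits against SchedGroupMask classes).
enum class InstrClass { DS_READ, DS_WRITE, MFMA, VMEM_READ, V_PERM };

struct Instr {
  InstrClass Class;
  int Id;
};

// A SchedGroup holds a class mask, a capacity, and extra rules that a
// candidate must satisfy before it can be assigned to the group.
struct SchedGroup {
  InstrClass Class;
  unsigned Size;
  std::vector<std::function<bool(const Instr &)>> Rules;
  std::vector<Instr> Assigned;

  bool canAdd(const Instr &I) const {
    if (I.Class != Class || Assigned.size() >= Size)
      return false;
    for (const auto &R : Rules)
      if (!R(I))
        return false;
    return true;
  }
};

// Greedily fit each instruction into the first group that accepts it.
void fitPipeline(std::vector<SchedGroup> &Pipeline,
                 const std::vector<Instr> &Instrs) {
  for (const Instr &I : Instrs)
    for (SchedGroup &G : Pipeline)
      if (G.canAdd(I)) {
        G.Assigned.push_back(I);
        break;
      }
}
```

The rules narrow which instructions a group will accept, so even a greedy fit is less likely to place an instruction in the wrong slot of a long pipeline.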

Diff Detail

Event Timeline

jrbyrnes created this revision. May 3 2023, 10:19 AM
Herald added a project: Restricted Project. · View Herald Transcript · May 3 2023, 10:19 AM
jrbyrnes requested review of this revision. May 3 2023, 10:19 AM
jrbyrnes updated this revision to Diff 520742. May 9 2023, 10:25 AM

Rebase + Supersede DemoOpt

kerbowa added inline comments. Jun 4 2023, 11:04 PM
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
906–1096

Extra private.

1147

For this type of idiom, wouldn't you normally use a visited set? This could also be improved by tracking all the VMEM_READs and their associated DS_WRITEs with perms via some map?

1154

Could both of these find_if calls before the if statement be hoisted to the outer loop?

1160

It looks like the assumption is that each V_PERM/DS_WRITE pair will have one VMEM load associated with it, since the loop stops at the first VMEM found. Is this correct?

1166

If there are no VMEM successors, can you mark the instruction as visited, or else just move this before the inner loop as I recommended above?

1176

I think the calculation for DSWWithSharedVMEMCount could be simplified by iterating over DSWithPerms once, adding each found VMEM to a list or map with a count that increases by 1 for each DSWithPerm paired with that VMEM. That would avoid some of the repeated traversals of the preds/succs of the same instructions.
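The single-pass counting suggested here can be sketched like this (generic C++ with hypothetical types; the actual code operates on SUnit pred/succ edges rather than precomputed ID lists):

```cpp
#include <cassert>
#include <map>
#include <vector>

// Hypothetical stand-in: each DS_WRITE-with-V_PERM records the IDs of
// the VMEM loads found among its predecessors.
using VMEMId = int;
struct DSWithPerm {
  std::vector<VMEMId> VMEMPreds;
};

// One pass to bump a per-VMEM use counter, then one pass to count the
// DS_WRITEs whose VMEM load is shared with at least one other DS_WRITE.
int countDSWWithSharedVMEM(const std::vector<DSWithPerm> &DSWithPerms) {
  std::map<VMEMId, int> UseCount;
  for (const DSWithPerm &DSW : DSWithPerms)
    for (VMEMId V : DSW.VMEMPreds)
      ++UseCount[V];

  int Shared = 0;
  for (const DSWithPerm &DSW : DSWithPerms)
    for (VMEMId V : DSW.VMEMPreds)
      if (UseCount[V] > 1) {
        ++Shared; // this DS_WRITE shares a VMEM load with another
        break;
      }
  return Shared;
}
```

Each instruction's predecessor list is walked a constant number of times, instead of re-traversing preds/succs for every pair of DS_WRITEs.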

jrbyrnes updated this revision to Diff 528645. Jun 5 2023, 5:31 PM
jrbyrnes marked 6 inline comments as done.

Thanks @kerbowa for pointing out this loop.

Rework loop to optimize + explicitly check for complete match & encode assumptions.

+Rebase

Looks correct to me; my only concern is the performance of the rule implementations, but I'm not sure how critical that is if this is only targeting a few small kernels.

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
1177

Again, would caching these results help, or is it unlikely to be a bottleneck?

1224

Will these rules be called relatively often? Would it help to cache the first 4 MFMAs in cases like this?

1225

Does it need to be a direct pred of the MFMA, or does it only need to be able to be scheduled before it? Could you use reachability in the DAG here instead of searching the preds like this?

jrbyrnes updated this revision to Diff 530954. Jun 13 2023, 9:56 AM
jrbyrnes marked 2 inline comments as done.

Like you, I am mindful of the compile-time costs, but not overly concerned since it is only relevant to a few blocks.

That said, the cache idea is a cheap way to get some compile-time improvements, so I've implemented that feature.

kerbowa added inline comments. Jun 25 2023, 11:23 PM
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
139

Looks good, but this seems a bit vague as a field in SchedGroup. Can these just be captured in the lambdas, since the way each rule would use a cache may be different?

jrbyrnes updated this revision to Diff 534669. Jun 26 2023, 11:30 AM

Move caches into rule scope.

kerbowa added inline comments. Jul 5 2023, 8:22 PM
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
116

Can you make this into a class or struct? We may want to add more to the Rule class than just the classifier and a cache.

140

Unused.

jrbyrnes updated this revision to Diff 537758. Jul 6 2023, 9:00 AM

InstructionRule class

kerbowa added inline comments. Jul 6 2023, 11:38 AM
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
113–114

Put this in the class so that it can access the cache?

115

What if this were a member function of InstructionRule, so that rules could be reused? Or do you think the rules will always be so specific that it is not needed, and that common functionality should just live in some functions somewhere?

jrbyrnes updated this revision to Diff 537874. Jul 6 2023, 2:07 PM

Convert function_ref to functor-like classes
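The functor-like rule shape discussed above can be sketched roughly as follows (a simplified, hypothetical model; the actual InstructionRule class in AMDGPUIGroupLP.cpp has a different interface and operates on SUnits):

```cpp
#include <cassert>
#include <unordered_set>

// Functor-like rule base class: each rule owns its cache, so expensive
// DAG queries can be memoized across invocations of apply().
class InstructionRule {
protected:
  // Per-rule cache of instruction IDs already known to satisfy the rule.
  std::unordered_set<int> Cache;

public:
  virtual bool apply(int InstrId) = 0;
  virtual ~InstructionRule() = default;
};

// Example rule: accepts even-numbered instructions, caching hits so the
// "expensive" check runs at most once per instruction.
class IsEvenRule final : public InstructionRule {
public:
  int Misses = 0; // counts evaluations that bypassed the cache

  bool apply(int InstrId) override {
    if (Cache.count(InstrId))
      return true;
    ++Misses; // the expensive check happens only on a cache miss
    if (InstrId % 2 != 0)
      return false;
    Cache.insert(InstrId);
    return true;
  }
};
```

Compared with plain function_refs, a class gives each rule a natural home for its cache (and any future per-rule state), which is what the review converged on.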

jrbyrnes updated this revision to Diff 537877. Jul 6 2023, 2:09 PM

Remove unintended errs()

kerbowa accepted this revision. Jul 9 2023, 1:23 PM

LGTM

This revision is now accepted and ready to land. Jul 9 2023, 1:23 PM
This revision was landed with ongoing or failed builds. Jul 13 2023, 12:04 PM
This revision was automatically updated to reflect the committed changes.