Diff 537758

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Show First 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	enum class SchedGroupMask {
DS_WRITE = 1u << 9,		DS_WRITE = 1u << 9,
ALL = ALU \| VALU \| SALU \| MFMA \| VMEM \| VMEM_READ \| VMEM_WRITE \| DS \|		ALL = ALU \| VALU \| SALU \| MFMA \| VMEM \| VMEM_READ \| VMEM_WRITE \| DS \|
DS_READ \| DS_WRITE,		DS_READ \| DS_WRITE,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)		LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};		};

class SchedGroup;		class SchedGroup;

typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;

typedef function_ref<bool(const SUnit , const ArrayRef<SUnit >,		typedef function_ref<bool(const SUnit , const ArrayRef<SUnit >,
		kerbowaUnsubmitted Not Done Reply Inline Actions Put this in the class so that it can access the cache? kerbowa: Put this in the class so that it can access the cache?
const SIInstrInfo *, SmallVectorImpl<SchedGroup> &,		const SIInstrInfo *, SmallVectorImpl<SchedGroup> &,
unsigned)>		unsigned, SmallVector<SUnit , 4> )>
InstructionRuleType;		InstructionClassifier;
		kerbowaUnsubmitted Not Done Reply Inline Actions What if this was a member function in InstructionRule, that way rules could be reused. Or do you think that the rules will always be so specific that it is not needed and that common functionality should just be in some functions somewhere? kerbowa: What if this was a member function in InstructionRule, that way rules could be reused. Or do…

		kerbowaUnsubmitted Not Done Reply Inline Actions Can you make this into a class or struct? We may want to add more to the Rule class than just the classifier and a cache. kerbowa: Can you make this into a class or struct? We may want to add more to the Rule class than just…
		class InstructionRule {
		private:
		// The lambda that decides whether or not a given SUnit belongs to the
		// SchedGroup
		InstructionClassifier Filter;
		// A cache made available to the Filter to store SUnits for subsequent
		// invocations of the Filter
		std::optional<SmallVector<SUnit *, 4>> FilterCache;

		public:
		InstructionClassifier getFilter() { return Filter; }
		SmallVector<SUnit , 4> getFilterCache() {
		return FilterCache.has_value() ? &*FilterCache : nullptr;
		}

		InstructionRule(InstructionClassifier Filter, bool FilterNeedsCache = false)
		: Filter(Filter) {
		if (FilterNeedsCache) {
		FilterCache = SmallVector<SUnit *, 4>();
		}
		}
		};

		typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;

// Classify instructions into groups to enable fine tuned control over the		// Classify instructions into groups to enable fine tuned control over the
// scheduler. These groups may be more specific than current SchedModel		// scheduler. These groups may be more specific than current SchedModel
// instruction classes.		// instruction classes.
class SchedGroup {		class SchedGroup {
private:		private:
// Mask that defines which instruction types can be classified into this		// Mask that defines which instruction types can be classified into this
// SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER		// SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
// and SCHED_GROUP_BARRIER.		// and SCHED_GROUP_BARRIER.
SchedGroupMask SGMask;		SchedGroupMask SGMask;

// Maximum number of SUnits that can be added to this group.		// Maximum number of SUnits that can be added to this group.
std::optional<unsigned> MaxSize;		std::optional<unsigned> MaxSize;

// SchedGroups will only synchronize with other SchedGroups that have the same		// SchedGroups will only synchronize with other SchedGroups that have the same
// SyncID.		// SyncID.
int SyncID = 0;		int SyncID = 0;

// SGID is used to map instructions to candidate SchedGroups		// SGID is used to map instructions to candidate SchedGroups
unsigned SGID;		unsigned SGID;

// The different rules each instruction in this SchedGroup must conform to		// The different rules each instruction in this SchedGroup must conform to
SmallVector<InstructionRuleType, 4> Rules;		SmallVector<InstructionRule, 4> Rules;

// Count of the number of created SchedGroups, used to initialize SGID.		// Count of the number of created SchedGroups, used to initialize SGID.
		kerbowaUnsubmitted Not Done Reply Inline Actions Looks good, but this seems a bit vague as a field in SchedGroup. Can these just be captured in the lambdas since the way each rule would use a cache may be different? kerbowa: Looks good, but this seems a bit vague as a field in SchedGroup. Can these just be captured in…
static unsigned NumSchedGroups;		static unsigned NumSchedGroups;
		kerbowaUnsubmitted Not Done Reply Inline Actions Unused. kerbowa: Unused.

const SIInstrInfo *TII;		const SIInstrInfo *TII;

// Try to add an edge from SU A to SU B.		// Try to add an edge from SU A to SU B.
bool tryAddEdge(SUnit A, SUnit B);		bool tryAddEdge(SUnit A, SUnit B);

// Use SGMask to determine whether we can classify MI as a member of this		// Use SGMask to determine whether we can classify MI as a member of this
// SchedGroup object.		// SchedGroup object.
Show All 29 Lines	public:

// Returns true if no more instructions may be added to this group.		// Returns true if no more instructions may be added to this group.
bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }		bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

// Append a constraint that SUs must meet in order to fit into this		// Append a constraint that SUs must meet in order to fit into this
// SchedGroup. Since many rules involve the relationship between a SchedGroup		// SchedGroup. Since many rules involve the relationship between a SchedGroup
// and the SUnits in other SchedGroups, rules are checked at Pipeline Solve		// and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
// time (rather than SchedGroup init time.)		// time (rather than SchedGroup init time.)
void addRule(const InstructionRuleType &NewRule) { Rules.push_back(NewRule); }		void addRule(const InstructionRule &NewRule) { Rules.push_back(NewRule); }

// Returns true if the SU matches all rules		// Returns true if the SU matches all rules
bool allowedByRules(const SUnit *SU,		bool allowedByRules(const SUnit *SU,
SmallVectorImpl<SchedGroup> &SyncPipe) const {		SmallVectorImpl<SchedGroup> &SyncPipe) const {
if (Rules.empty())		if (Rules.empty())
return true;		return true;
for (auto &Rule : Rules) {		for (auto &Rule : Rules) {
if (!Rule(SU, Collection, TII, SyncPipe, SGID)) {		InstructionRule TheRule = const_cast<InstructionRule >(&Rule);
		auto Filter = TheRule->getFilter();
		if (!Filter(SU, Collection, TII, SyncPipe, SGID,
		TheRule->getFilterCache())) {
return false;		return false;
}		}
}		}
return true;		return true;
}		}

// Add SU to the SchedGroup.		// Add SU to the SchedGroup.
void add(SUnit &SU) {		void add(SUnit &SU) {
▲ Show 20 Lines • Show All 240 Lines • ▼ Show 20 Lines	for (auto J = std::next(I); J != E; ++J) {
GroupA.link(GroupB);		GroupA.link(GroupB);
}		}
}		}
}		}

void PipelineSolver::makePipeline() {		void PipelineSolver::makePipeline() {
// Preserve the order of barrier for subsequent SchedGroupBarrier mutations		// Preserve the order of barrier for subsequent SchedGroupBarrier mutations
for (auto &SyncPipeline : BestPipeline) {		for (auto &SyncPipeline : BestPipeline) {
		LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
for (auto &SG : SyncPipeline) {		for (auto &SG : SyncPipeline) {
LLVM_DEBUG(dbgs() << "Printing SchedGroups\nSchedGroup with SGID "		LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
<< SG.getSGID() << " has: \n");		<< " has: \n");
SUnit *SGBarr = nullptr;		SUnit *SGBarr = nullptr;
for (auto &SU : SG.Collection) {		for (auto &SU : SG.Collection) {
if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)		if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
SGBarr = SU;		SGBarr = SU;
LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");		LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
}		}
// Command line requested IGroupLP doesn't have SGBarr		// Command line requested IGroupLP doesn't have SGBarr
if (!SGBarr)		if (!SGBarr)
▲ Show 20 Lines • Show All 365 Lines • ▼ Show 20 Lines	if (EnableExactSolver \|\| BelowCutoff) {
solveGreedy();		solveGreedy();
}		}

makePipeline();		makePipeline();
LLVM_DEBUG(dbgs() << "After applying mutation\n");		LLVM_DEBUG(dbgs() << "After applying mutation\n");
LLVM_DEBUG(DAG->dump());		LLVM_DEBUG(DAG->dump());
}		}

enum IGLPStrategyID : int { MFMASmallGemmOptID = 0, DemoOptID = 1 };		enum IGLPStrategyID : int {
		MFMASmallGemmOptID = 0,
		MFMASmallGemmSingleWaveOptID = 1,
		};

// Implement an IGLP scheduling strategy.		// Implement an IGLP scheduling strategy.
class IGLPStrategy {		class IGLPStrategy {
protected:		protected:
ScheduleDAGInstrs *DAG;		ScheduleDAGInstrs *DAG;

const SIInstrInfo *TII;		const SIInstrInfo *TII;

▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	for (unsigned I = 0; I < MFMACount * 3; ++I) {
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
}		}
}		}

class DemoOpt final : public IGLPStrategy {		class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
		kerbowaUnsubmitted Done Reply Inline Actions Extra private. kerbowa: Extra private.
private:
public:		public:
void applyIGLPStrategy(		void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,		DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;		DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;

bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }		bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

DemoOpt(ScheduleDAGInstrs DAG, const SIInstrInfo TII)		MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs DAG, const SIInstrInfo TII)
: IGLPStrategy(DAG, TII) {		: IGLPStrategy(DAG, TII) {
IsBottomUp = 0;		IsBottomUp = 0;
}		}
};		};

void DemoOpt::applyIGLPStrategy(		void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,		DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {		DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
// Count the number of MFMA instructions.
unsigned MFMACount = 0;		unsigned MFMACount = 0;
for (const MachineInstr &I : *DAG)		unsigned DSWCount = 0;
if (TII->isMFMAorWMMA(I))		unsigned DSWWithPermCount = 0;
		unsigned DSWWithSharedVMEMCount = 0;
		unsigned DSRCount = 0;
		SmallVector<SUnit *, 6> DSWithPerms;
		for (auto &SU : DAG->SUnits) {
		auto I = SU.getInstr();
		if (TII->isMFMAorWMMA(*I))
++MFMACount;		++MFMACount;
		else if (TII->isDS(*I)) {
		if (I->mayLoad())
		++DSRCount;
		else if (I->mayStore()) {
		++DSWCount;
		for (auto Pred : SU.Preds) {
		if (Pred.getSUnit()->getInstr()->getOpcode() ==
		AMDGPU::V_PERM_B32_e64) {
		DSWithPerms.push_back(&SU);
		break;
		}
		}
		}
		}
		}
		DSWWithPermCount = DSWithPerms.size();
		auto I = DSWithPerms.begin();
		auto E = DSWithPerms.end();

		// Get the count of DS_WRITES with V_PERM predecessors which
		// have loop carried dependencies (WAR) on the same VMEM_READs.
		// We consider partial overlap as a miss -- in other words,
		// for a given DS_W, we only consider another DS_W as matching
		// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
		// for every V_PERM pred of this DS_W.
		kerbowaUnsubmitted Done Reply Inline Actions For this type of idiom wouldn't you normally use a visited set? Could also be improved by tracking all the VMEM_R and their associated DS_WRITE with perms via some map? kerbowa: For this type of idiom wouldn't you normally use a visited set? Could also be improved by…
		DenseMap<MachineInstr , SUnit > VMEMLookup;
		SmallVector<SUnit *, 6> Counted;
		for (; I != E; I++) {
		SUnit *Cand = nullptr;
		bool MissedAny = false;
		for (auto &Pred : (*I)->Preds) {
		if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
		kerbowaUnsubmitted Done Reply Inline Actions Could both of these find_if before the if statement be hoisted to the outer loop? kerbowa: Could both of these find_if before the if statement be hoisted to the outer loop?
		continue;

const unsigned PipelineSyncID = 0;		if (Cand &&
SchedGroup *SG = nullptr;		std::find(Counted.begin(), Counted.end(), Cand) != Counted.end())
		break;

		kerbowaUnsubmitted Done Reply Inline Actions It looks like the assumption is that each V_PERM DS_WRITE pair will have one VMEM load associated with it, since the loop stops at the first VMEM found? Is this correct? kerbowa: It looks like the assumption is that each V_PERM DS_WRITE pair will have one VMEM load…
		for (auto &Succ : Pred.getSUnit()->Succs) {
		auto MI = Succ.getSUnit()->getInstr();
		if (!TII->isVMEM(*MI) \|\| !MI->mayLoad())
		continue;

		if (MissedAny \|\| !VMEMLookup.size()) {
		kerbowaUnsubmitted Done Reply Inline Actions If there are no VMEM successors, can you mark the instruction as visited, or else just move this before the inner loop like I recommend above? kerbowa: If there are no VMEM successors, can you mark the instruction as visited, or else just move this…
		MissedAny = true;
		VMEMLookup[MI] = *I;
		continue;
		}

		if (!VMEMLookup.contains(MI)) {
		MissedAny = true;
		VMEMLookup[MI] = *I;
		continue;
		}
		kerbowaUnsubmitted Done Reply Inline Actions I think the calculation for DSWWithSharedVMEMCount could be simplified by iterating over DSWithPerms once, while adding each found VMEM to a list or map with a count that increases by 1 for each found DSWithPerm paired with that VMEM. Would that avoid some of the repeated traversals of preds/succs of the same instructions? kerbowa: I think the calculation for DSWWithSharedVMEMCount could be simplified by iterating over…

		Cand = VMEMLookup[MI];
		if (std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) {
		MissedAny = true;
		break;
		}
		}
		}
		if (!MissedAny && Cand) {
		DSWWithSharedVMEMCount += 2;
		Counted.push_back(Cand);
		Counted.push_back(*I);
		}
		}

		assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
		SchedGroup *SG;
		unsigned PipelineSyncID = 0;
		// For kernels with V_PERM, there are enough VALU to mix in between MFMAs
		if (DSWWithPermCount) {
		for (unsigned I = 0; I < MFMACount; I++) {
		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
		}
		}

		PipelineSyncID = 1;
		// Phase 1: Break up DS_READ and MFMA clusters.
		// First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ
		// prefetch

		// Whether the DS_READ is a predecessor of first four MFMA in region
		InstructionClassifier EnablesInitialMFMA =
		[](const SUnit SU, ArrayRef<SUnit > Collection, const SIInstrInfo *TII,
		SmallVectorImpl<SchedGroup> &SyncPipe, unsigned SGID,
		SmallVector<SUnit , 4> Cache) {
		if (!SyncPipe.size())
		return false;
		int MFMAsFound = 0;
		if (!Cache->size()) {
		for (auto &Elt : SyncPipe[0].DAG->SUnits) {
		if (TII->isMFMAorWMMA(*Elt.getInstr())) {
		++MFMAsFound;
		kerbowaUnsubmitted Done Reply Inline Actions Will these rules be called relatively often? Would it help to cache the first 4 MFMAs in cases like this? kerbowa: Will these rules be called relatively often? Would it help to cache the first 4 MFMAs in cases…
		if (MFMAsFound > 4)
		kerbowaUnsubmitted Done Reply Inline Actions Does it need to be a direct pred of the MFMA or does it only need to be able to be scheduled before it, could you use reachability in the DAG here instead of searching the preds like this? kerbowa: Does it need to be a direct pred of the MFMA or does it only need to be able to be scheduled…
		break;
		Cache->push_back(&Elt);
		}
		}
		}

// The SU is a successor of SU in prev SchedGroup		assert(Cache->size());
InstructionRuleType Rule1 =		auto DAG = SyncPipe[0].DAG;
		for (auto &Elt : *Cache) {
		if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
		return true;
		}
		return false;
		};

		// Make ready initial MFMA
		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(EnablesInitialMFMA, true));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		// Interleave MFMA with DS_READ prefetch
		for (unsigned I = 0; I < DSRCount - 4; ++I) {
		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
		}

		// Phase 2a: Loop carried dependency with V_PERM
		// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
		// depend on. Interleave MFMA to keep XDL unit busy throughout.

		// Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
		InstructionClassifier IsPermForDSW =
[](const SUnit SU, ArrayRef<SUnit > Collection, const SIInstrInfo *TII,		[](const SUnit SU, ArrayRef<SUnit > Collection, const SIInstrInfo *TII,
SmallVectorImpl<SchedGroup> &SyncPipe, unsigned SGID) {		SmallVectorImpl<SchedGroup> &SyncPipe, unsigned SGID,
		SmallVector<SUnit , 4> Cache) {
auto MI = SU->getInstr();		auto MI = SU->getInstr();
if (MI->getOpcode() == TargetOpcode::BUNDLE)		if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
return false;		return false;

		bool FitsInGroup = false;
		// Does the VALU have a DS_WRITE successor
		if (!Collection.size()) {
		for (auto &Succ : SU->Succs) {
		SUnit *SuccUnit = Succ.getSUnit();
		if (TII->isDS(*SuccUnit->getInstr()) &&
		SuccUnit->getInstr()->mayStore()) {
		Cache->push_back(SuccUnit);
		FitsInGroup = true;
		}
		}
		return FitsInGroup;
		kerbowaUnsubmitted Not Done Reply Inline Actions Again, would caching these results help, or is it unlikely to be a bottleneck? kerbowa: Again, would caching these results help, or is it unlikely to be a bottleneck?
		}

		assert(Cache->size());

		// Does the VALU have a DS_WRITE successor that is the same as other
		// VALU already in the group. The V_PERMs will all share 1 DS_W succ
		return std::any_of(Cache->begin(), Cache->end(), [&SU](SUnit *Elt) {
		return std::any_of(SU->Succs.begin(), SU->Succs.end(),
		[&Elt](const SDep &ThisSucc) {
		return ThisSucc.getSUnit() == Elt;
		});
		});
		};

		// Whether the SU is a successor of any element in previous SchedGroup
		InstructionClassifier IsSuccOfPrevGroup =
		[](const SUnit SU, ArrayRef<SUnit > Collection, const SIInstrInfo *TII,
		SmallVectorImpl<SchedGroup> &SyncPipe, unsigned SGID,
		SmallVector<SUnit , 4> Cache) {
SchedGroup *OtherGroup = nullptr;		SchedGroup *OtherGroup = nullptr;
for (auto &PipeSG : SyncPipe) {		for (auto &PipeSG : SyncPipe) {
if (PipeSG.getSGID() == (int)SGID - 1) {		if ((unsigned)PipeSG.getSGID() == SGID - 1) {
OtherGroup = &PipeSG;		OtherGroup = &PipeSG;
}		}
}		}

if (!OtherGroup)		if (!OtherGroup)
return false;		return false;
		if (!OtherGroup->Collection.size())
		return true;

		// Does the previous VALU have this DS_Write as a successor
return (std::any_of(OtherGroup->Collection.begin(),		return (std::any_of(OtherGroup->Collection.begin(),
OtherGroup->Collection.end(), [&SU](SUnit *Elt) {		OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
return std::any_of(Elt->Succs.begin(),		return std::any_of(Elt->Succs.begin(),
Elt->Succs.end(),		Elt->Succs.end(),
[&SU](SDep &Succ) {		[&SU](SDep &Succ) {
return Succ.getSUnit() == SU;		return Succ.getSUnit() == SU;
});		});
}));		}));
};		};

SmallVector<InstructionRuleType, 4> DemoRules;		// Whether the combined load width of group is 128 bits
		InstructionClassifier VMEMSize =
		[](const SUnit SU, ArrayRef<SUnit > Collection, const SIInstrInfo *TII,
		SmallVectorImpl<SchedGroup> &SyncPipe, unsigned SGID,
		SmallVector<SUnit , 4> Cache) {
		auto MI = SU->getInstr();
		if (MI->getOpcode() == TargetOpcode::BUNDLE)
		return false;
		if (!Collection.size())
		return true;

		int NumBits = 0;

		auto TRI = TII->getRegisterInfo();
		auto &MRI = MI->getParent()->getParent()->getRegInfo();
		for (auto &Elt : Collection) {
		auto Op = Elt->getInstr()->getOperand(0);
		auto Size =
		TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
		NumBits += Size;
		}

		if (NumBits < 128) {
		assert(TII->isVMEM(*MI) && MI->mayLoad());
		if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
		MRI, MI->getOperand(0))) <=
		128)
		return true;
		}

		return false;
		};

		// Whether the SU shares a V_PERM predecessor with any SU in the previous
		// SchedGroup
		InstructionClassifier SharesPredWithPrevGroup =
		[](const SUnit SU, ArrayRef<SUnit > Collection, const SIInstrInfo *TII,
		SmallVectorImpl<SchedGroup> &SyncPipe, unsigned SGID,
		SmallVector<SUnit , 4> Cache) {
		SchedGroup *OtherGroup = nullptr;
		if (!SyncPipe.size())
		return false;

		if (!Cache->size()) {

		for (auto &PipeSG : SyncPipe) {
		if ((unsigned)PipeSG.getSGID() == SGID - 1) {
		OtherGroup = &PipeSG;
		}
		}

		if (!OtherGroup)
		return false;
		if (!OtherGroup->Collection.size())
		return true;

		for (auto &OtherEle : OtherGroup->Collection) {
		for (auto &Pred : OtherEle->Preds) {
		if (Pred.getSUnit()->getInstr()->getOpcode() ==
		AMDGPU::V_PERM_B32_e64)
		Cache->push_back(Pred.getSUnit());
		}
		}
		}

		assert(Cache->size());
		auto DAG = SyncPipe[0].DAG;
		// Does the previous DS_WRITE share a V_PERM predecessor with this
		// VMEM_READ
		return (
		std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) {
		return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
		}));
		};

		// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 3
		// steps back in the pipeline
		InstructionClassifier SharesPredWithThirdPrevGroup =
		[](const SUnit SU, ArrayRef<SUnit > Collection, const SIInstrInfo *TII,
		SmallVectorImpl<SchedGroup> &SyncPipe, unsigned SGID,
		SmallVector<SUnit , 4> Cache) {
		SchedGroup *OtherGroup = nullptr;
		if (!SyncPipe.size())
		return false;

		if (!Cache->size()) {
		for (auto &PipeSG : SyncPipe) {
		if ((unsigned)PipeSG.getSGID() == SGID - 3) {
		OtherGroup = &PipeSG;
		}
		}

		if (!OtherGroup)
		return false;
		if (!OtherGroup->Collection.size())
		return true;

		for (auto &OtherEle : OtherGroup->Collection) {
		for (auto &Pred : OtherEle->Preds) {
		if (Pred.getSUnit()->getInstr()->getOpcode() ==
		AMDGPU::V_PERM_B32_e64)
		Cache->push_back(Pred.getSUnit());
		}
		}
		}

		assert(Cache->size());
		auto DAG = SyncPipe[0].DAG;
		// Does the previous DS_WRITE share a V_PERM predecessor with this
		// VMEM_READ
		return (
		std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) {
		return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
		}));
		};

		for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(IsPermForDSW, true));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(IsSuccOfPrevGroup));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(SharesPredWithPrevGroup, true));
		SG->addRule(InstructionRule(VMEMSize));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

// Each iteration of pipeline has 1 MFMA and 1 DS_W, where the DS_W is a
// successor of the MFMA
for (unsigned I = 0; I < MFMACount; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(SharesPredWithThirdPrevGroup, true));
		SG->addRule(InstructionRule(VMEMSize));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
		}

		// Phase 2b: Loop carried dependency without V_PERM
		// Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
		// Interleave MFMA to keep XDL unit busy throughout.
		for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);		SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
SG->addRule(Rule1);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(VMEMSize));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
		}

		// Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
		// ultimately used by two DS_WRITE
		// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
		// depend on. Interleave MFMA to keep XDL unit busy throughout.

		// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 2
		// steps back in the pipeline
		InstructionClassifier SharesPredWithSecondPrevGroup =
		[](const SUnit SU, ArrayRef<SUnit > Collection, const SIInstrInfo *TII,
		SmallVectorImpl<SchedGroup> &SyncPipe, unsigned SGID,
		SmallVector<SUnit , 4> Cache) {
		SchedGroup *OtherGroup = nullptr;
		if (!SyncPipe.size())
		return false;

		if (!Cache->size()) {
		for (auto &PipeSG : SyncPipe) {
		if ((unsigned)PipeSG.getSGID() == SGID - 2) {
		OtherGroup = &PipeSG;
		}
		}

		if (!OtherGroup)
		return false;
		if (!OtherGroup->Collection.size())
		return true;

		for (auto &OtherEle : OtherGroup->Collection) {
		for (auto &Pred : OtherEle->Preds) {
		if (Pred.getSUnit()->getInstr()->getOpcode() ==
		AMDGPU::V_PERM_B32_e64)
		Cache->push_back(Pred.getSUnit());
		}
		}
		}

		assert(Cache->size());
		auto DAG = SyncPipe[0].DAG;
		// Does the previous DS_WRITE share a V_PERM predecessor with this
		// VMEM_READ
		return (
		std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) {
		return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
		}));
		};

		// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup 4
		// steps back in the pipeline
		InstructionClassifier SharesPredWithFourthPrevGroup =
		[](const SUnit SU, ArrayRef<SUnit > Collection, const SIInstrInfo *TII,
		SmallVectorImpl<SchedGroup> &SyncPipe, unsigned SGID,
		SmallVector<SUnit , 4> Cache) {
		SchedGroup *OtherGroup = nullptr;
		if (!SyncPipe.size())
		return false;

		if (!Cache->size()) {
		for (auto &PipeSG : SyncPipe) {
		if ((unsigned)PipeSG.getSGID() == SGID - 4) {
		OtherGroup = &PipeSG;
		}
		}

		if (!OtherGroup)
		return false;
		if (!OtherGroup->Collection.size())
		return true;

		for (auto &OtherEle : OtherGroup->Collection) {
		for (auto &Pred : OtherEle->Preds) {
		if (Pred.getSUnit()->getInstr()->getOpcode() ==
		AMDGPU::V_PERM_B32_e64)
		Cache->push_back(Pred.getSUnit());
		}
		}
		}
		assert(Cache->size());
		auto DAG = SyncPipe[0].DAG;
		// Does the previous DS_WRITE share a V_PERM predecessor with this
		// VMEM_READ
		return (
		std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) {
		return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
		}));
		};

		for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(IsPermForDSW, true));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(IsSuccOfPrevGroup));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(IsPermForDSW, true));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(IsSuccOfPrevGroup));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(SharesPredWithSecondPrevGroup, true));
		SG->addRule(InstructionRule(VMEMSize));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
		SG->addRule(InstructionRule(SharesPredWithFourthPrevGroup, true));
		SG->addRule(InstructionRule(VMEMSize));
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
}		}
}		}

static std::unique_ptr<IGLPStrategy>		static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,		createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
const SIInstrInfo *TII) {		const SIInstrInfo *TII) {
switch (ID) {		switch (ID) {
case MFMASmallGemmOptID:		case MFMASmallGemmOptID:
return std::make_unique<MFMASmallGemmOpt>(DAG, TII);		return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
case DemoOptID:		case MFMASmallGemmSingleWaveOptID:
return std::make_unique<DemoOpt>(DAG, TII);		return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
}		}

llvm_unreachable("Unknown IGLPStrategyID");		llvm_unreachable("Unknown IGLPStrategyID");
}		}

class IGroupLPDAGMutation : public ScheduleDAGMutation {		class IGroupLPDAGMutation : public ScheduleDAGMutation {
private:		private:
const SIInstrInfo *TII;		const SIInstrInfo *TII;
▲ Show 20 Lines • Show All 342 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

	Show First 20 Lines • Show All 162 Lines • ▼ Show 20 Lines
	; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80			; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80
	; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64			; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64
	; GCN-NEXT: ds_read_b128 a[0:3], v1			; GCN-NEXT: ds_read_b128 a[0:3], v1
	; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16			; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16
	; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32			; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32
	; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48			; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48
	; GCN-NEXT: s_waitcnt lgkmcnt(0)			; GCN-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]			; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
	; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304			; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304
	; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288			; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288
	; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272			; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272
	; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256			; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256
	; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240			; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240
	; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224			; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224
	; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208			; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208
	; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192			; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192
	; GCN-NEXT: v_add_u32_e32 v0, s1, v0			; GCN-NEXT: v_add_u32_e32 v0, s1, v0
	; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688			; GCN-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672			; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
	; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656			; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688
	; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640			; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672
	; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624			; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656
	; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608			; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640
	; GCN-NEXT: s_nop 3			; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624
	; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112			; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608
	; GCN-NEXT: s_waitcnt lgkmcnt(7)			; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592
				; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576
				; GCN-NEXT: s_waitcnt lgkmcnt(0)
	; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]			; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
	; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592			; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264
	; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576			; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248
				; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232
				; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216
				; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200
				; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
				; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
				; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
				; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1
				; GCN-NEXT: s_waitcnt lgkmcnt(0)
				; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
				; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456
				; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440
				; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424
				; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408
				; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344
				; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360
				; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376
				; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392
				; GCN-NEXT: s_waitcnt lgkmcnt(0)
				; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
				; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
	; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96			; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
	; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80			; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
	; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64			; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64
	; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48			; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48
	; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32			; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
	; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16			; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
	; GCN-NEXT: ds_write_b128 v0, a[0:3]			; GCN-NEXT: ds_write_b128 v0, a[0:3]
	; GCN-NEXT: v_mov_b32_e32 v0, s1			; GCN-NEXT: v_mov_b32_e32 v0, s1
	; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264			; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288
	; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248			; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304
	; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232			; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256
	; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216			; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:8272
	; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200			; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:8224
	; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184			; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:8240
	; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168			; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:8192
	; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152			; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:8208
	; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v1			; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:16480
	; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288			; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:16496
	; GCN-NEXT: s_waitcnt lgkmcnt(14)			; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16448
	; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]			; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16464
	; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456			; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16416
	; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440			; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16432
	; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424			; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16384
	; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408			; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16400
	; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344			; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:24672
	; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360			; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:24688
	; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376			; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24640
	; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392			; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24656
	; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304			; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24608
	; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256			; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:24624
	; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272			; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:24576
	; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224			; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:24592
	; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240
	; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192
	; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208
	; GCN-NEXT: s_nop 3
	; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480
	; GCN-NEXT: s_waitcnt lgkmcnt(14)
	; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
	; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496
	; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448
	; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464
	; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416
	; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432
	; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384
	; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400
	; GCN-NEXT: s_nop 7
	; GCN-NEXT: s_nop 3
	; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672
	; GCN-NEXT: s_waitcnt lgkmcnt(14)
	; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
	; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688
	; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640
	; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656
	; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608
	; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624
	; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576
	; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592
	; GCN-NEXT: s_nop 7
	; GCN-NEXT: s_nop 3
	; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864			; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:32864
	; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880			; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:32880
	; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832			; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:32832
	; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848			; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:32848
	; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800			; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:32800
	; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816			; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:32816
	; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768			; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:32768
	; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784			; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:32784
	Show All 39 Lines

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir

This file was added.

				# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
				# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s \| FileCheck -check-prefix=GCN %s

				--- \|
				define amdgpu_kernel void @single-wave-phase-2b(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17, i32 addrspace(7)* noalias %in18, i32 addrspace(7)* noalias %in19, i32 addrspace(7)* noalias %in20, i32 addrspace(7)* noalias %in21, i32 addrspace(7)* noalias %in22, i32 addrspace(7)* noalias %in23, i32 addrspace(7)* noalias %in24, i32 addrspace(7)* noalias %in25, i32 addrspace(7)* noalias %in26, i32 addrspace(7)* noalias %in27, i32 addrspace(7)* noalias %in28, i32 addrspace(7)* noalias %in29) #0 { ret void }

				!0 = distinct !{!0}
				!1 = !{!1, !0}
				...


				---
				name: single-wave-phase-2b
				tracksRegLiveness: true
				machineFunctionInfo:
				occupancy: 1
				body: \|
				; GCN-LABEL: name: single-wave-phase-2b
				; GCN: bb.0:
				; GCN-NEXT: successors: %bb.1(0x80000000)
				; GCN-NEXT: {{ $}}
				; GCN-NEXT: [[DEF:%[0-9]+]]:areg_512_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF1:%[0-9]+]]:areg_512_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF2:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF3:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF4:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF5:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF32:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF33:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF34:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF35:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF38:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF39:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF40:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF41:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF42:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF43:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF44:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF45:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF46:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF47:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF48:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF49:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
				; GCN-NEXT: {{ $}}
				; GCN-NEXT: bb.1:
				; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
				; GCN-NEXT: {{ $}}
				; GCN-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 0, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 2064, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 2080, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF8]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF9]], implicit $exec
				; GCN-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[DS_READ_B128_gfx9_5:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF6]], 3120, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF10]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF11]], implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[DS_READ_B128_gfx9_6:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 4128, 0, implicit $exec :: (load (s128) from %ir.in6, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF12]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF13]], implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[DS_READ_B128_gfx9_7:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 6192, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF14]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF15]], implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_6]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[DS_READ_B128_gfx9_8:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 1024, 0, implicit $exec :: (load (s128) from %ir.in8, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF16]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF17]], implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_6]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[DS_READ_B128_gfx9_9:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 3088, 0, implicit $exec :: (load (s128) from %ir.in9, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF18]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF19]], implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub0_sub1, [[DS_READ_B128_gfx9_7]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[DS_READ_B128_gfx9_10:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 5152, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_ADD_U32_e32_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF20]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF21]], implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub2_sub3, [[DS_READ_B128_gfx9_7]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[DS_READ_B128_gfx9_11:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF7]], 7216, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_8]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: undef %63.sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF28]], [[DEF29]], [[DEF44]], implicit $exec
				; GCN-NEXT: %63.sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF30]], [[DEF31]], [[DEF44]], implicit $exec
				; GCN-NEXT: %63.sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF24]], [[DEF25]], [[DEF44]], implicit $exec
				; GCN-NEXT: %63.sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF26]], [[DEF27]], [[DEF44]], implicit $exec
				; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF40]], %63, 0, 0, implicit $exec :: (store (s128) into %ir.in0, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in12, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_1]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in13, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_2]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in14, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_3]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in15, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_8]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_4]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in16, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_5]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in17, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_6]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in18, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_7]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in19, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_9]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: undef %64.sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF36]], [[DEF37]], [[DEF44]], implicit $exec
				; GCN-NEXT: %64.sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF38]], [[DEF39]], [[DEF44]], implicit $exec
				; GCN-NEXT: %64.sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF32]], [[DEF33]], [[DEF44]], implicit $exec
				; GCN-NEXT: %64.sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF34]], [[DEF35]], [[DEF44]], implicit $exec
				; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF40]], %64, 1040, 0, implicit $exec :: (store (s128) into %ir.in1, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_8]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in20, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_9]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in21, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_10]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in22, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_11]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in23, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_9]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_12]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_13]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[V_ADD_U32_e32_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF22]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF42]], [[DEF23]], implicit $exec
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_14]], [[DEF47]], 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in26, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_USHORT_OFFEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[V_ADD_U32_e32_15]], [[DEF47]], 0, 0, 0, 0, implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub0_sub1, [[DS_READ_B128_gfx9_10]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF2]], 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_ADD_U32_e32_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF45]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -16, [[DEF46]], implicit $exec
				; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_16]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in26, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_4]].sub2_sub3, [[DS_READ_B128_gfx9_10]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF3]], 2064, 0, implicit $exec :: (store (s128) into %ir.in3, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF45]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in27, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[V_ADD_U32_e32_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_ADD_U32_e32_18]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_19]], implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub0_sub1, [[DS_READ_B128_gfx9_11]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF4]], 2080, 0, implicit $exec :: (store (s128) into %ir.in4, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF46]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in28, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[V_ADD_U32_e32_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_ADD_U32_e32_20]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_21]], implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_3:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_5]].sub2_sub3, [[DS_READ_B128_gfx9_11]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF41]], [[DEF5]], 16, 0, implicit $exec :: (store (s128) into %ir.in5, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_17]], [[DEF48]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in29, !alias.scope !0, addrspace 7)
				; GCN-NEXT: IGLP_OPT 1
				; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def dead $scc
				; GCN-NEXT: S_CMP_LG_U32 [[S_ADD_I32_]], 0, implicit-def $scc
				; GCN-NEXT: [[V_ADD_U32_e32_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_22]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_23]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_24]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_25]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_26]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_27]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_28]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_29]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_30]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_31]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_32]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_33]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_34]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF43]], [[V_ADD_U32_e32_35]], implicit $exec
				; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
				; GCN-NEXT: {{ $}}
				; GCN-NEXT: bb.2:
				; GCN-NEXT: S_ENDPGM 0
				bb.0:
				%761:areg_512_align2 = IMPLICIT_DEF
				%814:areg_512_align2 = IMPLICIT_DEF
				%1757:av_128_align2 = IMPLICIT_DEF
				%1755:av_128_align2 = IMPLICIT_DEF
				%1754:av_128_align2 = IMPLICIT_DEF
				%1756:av_128_align2 = IMPLICIT_DEF
				%37:vgpr_32 = IMPLICIT_DEF
				%38:vgpr_32 = IMPLICIT_DEF
				%1736:vgpr_32 = IMPLICIT_DEF
				%1737:vgpr_32 = IMPLICIT_DEF
				%1738:vgpr_32 = IMPLICIT_DEF
				%1739:vgpr_32 = IMPLICIT_DEF
				%1740:vgpr_32 = IMPLICIT_DEF
				%1741:vgpr_32 = IMPLICIT_DEF
				%1742:vgpr_32 = IMPLICIT_DEF
				%1743:vgpr_32 = IMPLICIT_DEF
				%1744:vgpr_32 = IMPLICIT_DEF
				%1745:vgpr_32 = IMPLICIT_DEF
				%1746:vgpr_32 = IMPLICIT_DEF
				%1747:vgpr_32 = IMPLICIT_DEF
				%1748:vgpr_32 = IMPLICIT_DEF
				%1749:vgpr_32 = IMPLICIT_DEF
				%1750:vgpr_32 = IMPLICIT_DEF
				%1751:vgpr_32 = IMPLICIT_DEF
				%1766:vgpr_32 = IMPLICIT_DEF
				%1767:vgpr_32 = IMPLICIT_DEF
				%1768:vgpr_32 = IMPLICIT_DEF
				%1769:vgpr_32 = IMPLICIT_DEF
				%1770:vgpr_32 = IMPLICIT_DEF
				%1771:vgpr_32 = IMPLICIT_DEF
				%1772:vgpr_32 = IMPLICIT_DEF
				%1773:vgpr_32 = IMPLICIT_DEF
				%1758:vgpr_32 = IMPLICIT_DEF
				%1759:vgpr_32 = IMPLICIT_DEF
				%1760:vgpr_32 = IMPLICIT_DEF
				%1761:vgpr_32 = IMPLICIT_DEF
				%1762:vgpr_32 = IMPLICIT_DEF
				%1763:vgpr_32 = IMPLICIT_DEF
				%1764:vgpr_32 = IMPLICIT_DEF
				%1765:vgpr_32 = IMPLICIT_DEF
				%14:vgpr_32 = IMPLICIT_DEF
				%31:vgpr_32 = IMPLICIT_DEF
				%41:vgpr_32 = IMPLICIT_DEF
				%43:sreg_32 = IMPLICIT_DEF
				%535:sreg_32 = IMPLICIT_DEF
				%1752:vgpr_32 = IMPLICIT_DEF
				%1753:vgpr_32 = IMPLICIT_DEF
				%450:sgpr_128 = IMPLICIT_DEF
				%518:sgpr_128 = IMPLICIT_DEF
				%1735:sreg_32 = IMPLICIT_DEF

				bb.1:
				IGLP_OPT 1
				%683:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3)
				%688:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3)
				%693:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 2080, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3)
				%698:av_128_align2 = DS_READ_B128_gfx9 %37:vgpr_32, 3120, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3)
				%703:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3)
				%708:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 2064, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3)
				%713:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 4128, 0, implicit $exec :: (load (s128) from %ir.in6, !alias.scope !0, addrspace 3)
				%718:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 6192, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 3)
				%761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub0_sub1:av_128_align2, %703.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%762:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub2_sub3:av_128_align2, %703.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub0_sub1:av_128_align2, %708.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub2_sub3:av_128_align2, %708.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub0_sub1:av_128_align2, %713.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub2_sub3:av_128_align2, %713.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub0_sub1:av_128_align2, %718.sub0_sub1:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%761:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub2_sub3:av_128_align2, %718.sub2_sub3:av_128_align2, %761:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%764:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 1024, 0, implicit $exec :: (load (s128) from %ir.in8, !alias.scope !0, addrspace 3)
				%769:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 3088, 0, implicit $exec :: (load (s128) from %ir.in9, !alias.scope !0, addrspace 3)
				%774:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 5152, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 3)
				%779:av_128_align2 = DS_READ_B128_gfx9 %38:vgpr_32, 7216, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 3)
				%814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub0_sub1:av_128_align2, %764.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %683.sub2_sub3:av_128_align2, %764.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub0_sub1:av_128_align2, %769.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %688.sub2_sub3:av_128_align2, %769.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub0_sub1:av_128_align2, %774.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %693.sub2_sub3:av_128_align2, %774.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub0_sub1:av_128_align2, %779.sub0_sub1:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%814:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %698.sub2_sub3:av_128_align2, %779.sub2_sub3:av_128_align2, %814:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				undef %1599.sub3:vreg_128_align2 = V_PERM_B32_e64 %1766:vgpr_32, %1767:vgpr_32, %535:sreg_32, implicit $exec
				%1599.sub2:vreg_128_align2 = V_PERM_B32_e64 %1768:vgpr_32, %1769:vgpr_32, %535:sreg_32, implicit $exec
				%1599.sub1:vreg_128_align2 = V_PERM_B32_e64 %1770:vgpr_32, %1771:vgpr_32, %535:sreg_32, implicit $exec
				%1599.sub0:vreg_128_align2 = V_PERM_B32_e64 %1772:vgpr_32, %1773:vgpr_32, %535:sreg_32, implicit $exec
				undef %1579.sub3:vreg_128_align2 = V_PERM_B32_e64 %1758:vgpr_32, %1759:vgpr_32, %535:sreg_32, implicit $exec
				%1579.sub2:vreg_128_align2 = V_PERM_B32_e64 %1760:vgpr_32, %1761:vgpr_32, %535:sreg_32, implicit $exec
				%1579.sub1:vreg_128_align2 = V_PERM_B32_e64 %1762:vgpr_32, %1763:vgpr_32, %535:sreg_32, implicit $exec
				%1579.sub0:vreg_128_align2 = V_PERM_B32_e64 %1764:vgpr_32, %1765:vgpr_32, %535:sreg_32, implicit $exec
				DS_WRITE_B128_gfx9 %14:vgpr_32, %1599:vreg_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in0, !alias.scope !0, addrspace 3)
				DS_WRITE_B128_gfx9 %14:vgpr_32, %1579:vreg_128_align2, 1040, 0, implicit $exec :: (store (s128) into %ir.in1, !alias.scope !0, addrspace 3)
				%830:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1736:vgpr_32, implicit $exec
				%1773:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %830:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in12, !alias.scope !0, addrspace 7)
				%833:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1737:vgpr_32, implicit $exec
				%1772:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %833:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in13, !alias.scope !0, addrspace 7)
				%835:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1738:vgpr_32, implicit $exec
				%1771:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %835:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in14, !alias.scope !0, addrspace 7)
				%837:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1739:vgpr_32, implicit $exec
				%1770:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %837:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in15, !alias.scope !0, addrspace 7)
				%839:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1740:vgpr_32, implicit $exec
				%1769:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %839:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in16, !alias.scope !0, addrspace 7)
				%841:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1741:vgpr_32, implicit $exec
				%1768:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %841:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in17, !alias.scope !0, addrspace 7)
				%843:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1742:vgpr_32, implicit $exec
				%1767:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %843:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in18, !alias.scope !0, addrspace 7)
				%845:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1743:vgpr_32, implicit $exec
				%1766:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %845:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in19, !alias.scope !0, addrspace 7)
				%847:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1744:vgpr_32, implicit $exec
				%1758:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %847:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in20, !alias.scope !0, addrspace 7)
				%849:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1745:vgpr_32, implicit $exec
				%1759:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %849:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in21, !alias.scope !0, addrspace 7)
				%851:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1746:vgpr_32, implicit $exec
				%1760:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %851:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in22, !alias.scope !0, addrspace 7)
				%853:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1747:vgpr_32, implicit $exec
				%1761:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %853:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in23, !alias.scope !0, addrspace 7)
				%855:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1748:vgpr_32, implicit $exec
				%1762:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %855:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in24, !alias.scope !0, addrspace 7)
				%857:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1749:vgpr_32, implicit $exec
				%1763:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %857:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in25, !alias.scope !0, addrspace 7)
				%859:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1750:vgpr_32, implicit $exec
				%1764:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %859:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s16) from %ir.in26, !alias.scope !0, addrspace 7)
				%861:vgpr_32 = V_ADD_U32_e32 %41:vgpr_32, %1751:vgpr_32, implicit $exec
				%1765:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %861:vgpr_32, %450:sgpr_128, 0, 0, 0, 0, implicit $exec
				DS_WRITE_B128_gfx9 %31:vgpr_32, %1757:av_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in2, !alias.scope !0, addrspace 3)
				DS_WRITE_B128_gfx9 %31:vgpr_32, %1755:av_128_align2, 2064, 0, implicit $exec :: (store (s128) into %ir.in3, !alias.scope !0, addrspace 3)
				DS_WRITE_B128_gfx9 %31:vgpr_32, %1754:av_128_align2, 2080, 0, implicit $exec :: (store (s128) into %ir.in4, !alias.scope !0, addrspace 3)
				DS_WRITE_B128_gfx9 %31:vgpr_32, %1756:av_128_align2, 16, 0, implicit $exec :: (store (s128) into %ir.in5, !alias.scope !0, addrspace 3)
				%864:vgpr_32 = V_ADD_U32_e32 -16, %1752:vgpr_32, implicit $exec
				%1757:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %864:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in26, !alias.scope !0, addrspace 7)
				%1755:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %1752:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in27, !alias.scope !0, addrspace 7)
				%1754:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %1753:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in28, !alias.scope !0, addrspace 7)
				%865:vgpr_32 = V_ADD_U32_e32 -16, %1753:vgpr_32, implicit $exec
				%1756:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %865:vgpr_32, %518:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in29, !alias.scope !0, addrspace 7)
				%1753:vgpr_32 = V_ADD_U32_e32 128, %1753:vgpr_32, implicit $exec
				%1752:vgpr_32 = V_ADD_U32_e32 128, %1752:vgpr_32, implicit $exec
				%1751:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1751:vgpr_32, implicit $exec
				%1750:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1750:vgpr_32, implicit $exec
				%1749:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1749:vgpr_32, implicit $exec
				%1748:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1748:vgpr_32, implicit $exec
				%1747:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1747:vgpr_32, implicit $exec
				%1746:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1746:vgpr_32, implicit $exec
				%1745:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1745:vgpr_32, implicit $exec
				%1744:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1744:vgpr_32, implicit $exec
				%1743:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1743:vgpr_32, implicit $exec
				%1742:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1742:vgpr_32, implicit $exec
				%1741:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1741:vgpr_32, implicit $exec
				%1740:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1740:vgpr_32, implicit $exec
				%1739:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1739:vgpr_32, implicit $exec
				%1738:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1738:vgpr_32, implicit $exec
				%1737:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1737:vgpr_32, implicit $exec
				%1736:vgpr_32 = V_ADD_U32_e32 %43:sreg_32, %1736:vgpr_32, implicit $exec
				%1735:sreg_32 = nsw S_ADD_I32 %1735:sreg_32, -1, implicit-def dead $scc
				S_CMP_LG_U32 %1735:sreg_32, 0, implicit-def $scc
				S_CBRANCH_SCC1 %bb.1, implicit killed $scc

				bb.2:
				S_ENDPGM 0
				---

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir

This file was added.

				# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
				# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s \| FileCheck -check-prefix=GCN %s

				--- \|
				define amdgpu_kernel void @single-wave-phase-2c(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17) #0 { ret void }


				!0 = distinct !{!0}
				!1 = !{!1, !0}
				...

				---
				name: single-wave-phase-2c
				tracksRegLiveness: true
				machineFunctionInfo:
				occupancy: 1
				body: \|
				; GCN-LABEL: name: single-wave-phase-2c
				; GCN: bb.0:
				; GCN-NEXT: successors: %bb.1(0x80000000)
				; GCN-NEXT: {{ $}}
				; GCN-NEXT: [[DEF:%[0-9]+]]:av_512_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF1:%[0-9]+]]:av_512_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF7:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF16:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: dead [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF28:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF29:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF30:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF31:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF32:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
				; GCN-NEXT: [[DEF33:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
				; GCN-NEXT: {{ $}}
				; GCN-NEXT: bb.1:
				; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
				; GCN-NEXT: {{ $}}
				; GCN-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 0, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 2064, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF1]]
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF33]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF21]], implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[DS_READ_B128_gfx9_5:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 3088, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF22]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF23]], implicit $exec
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF4]], [[DEF16]], 0, 0, implicit $exec :: (store (s128) into %ir.in6, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF6]], [[DEF7]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7)
				; GCN-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF]]
				; GCN-NEXT: undef [[DEF17]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF30]], implicit $exec
				; GCN-NEXT: [[DEF17]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF30]], implicit $exec
				; GCN-NEXT: [[DEF17]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF30]], implicit $exec
				; GCN-NEXT: [[DEF17]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF30]], implicit $exec
				; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF5]], [[DEF17]], 0, 0, implicit $exec :: (store (s128) into %ir.in8, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_4]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: undef [[DEF18]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF31]], implicit $exec
				; GCN-NEXT: [[DEF18]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF31]], implicit $exec
				; GCN-NEXT: [[DEF18]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF31]], implicit $exec
				; GCN-NEXT: [[DEF18]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF31]], implicit $exec
				; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF5]], [[DEF18]], 16, 0, implicit $exec :: (store (s128) into %ir.in9, !alias.scope !0, addrspace 3)
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_4]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_1]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_2]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in12, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_3]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in13, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_5]].sub0_sub1, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF24]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF25]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF26]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF27]], implicit $exec
				; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_4]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in14, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_5]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in15, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_6]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in16, !alias.scope !0, addrspace 7)
				; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_7]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in17, !alias.scope !0, addrspace 7)
				; GCN-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_e64_2:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_5]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
				; GCN-NEXT: IGLP_OPT 1
				; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nsw S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def dead $scc
				; GCN-NEXT: S_CMP_LG_U32 [[S_ADD_I32_]], 0, implicit-def $scc
				; GCN-NEXT: [[V_ADD_U32_e32_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_8]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_9]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_10]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_11]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 64, [[V_ADD_U32_e32_12]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_13]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_14]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_15]], implicit $exec
				; GCN-NEXT: [[V_ADD_U32_e32_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF28]], [[V_ADD_U32_e32_16]], implicit $exec
				; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
				; GCN-NEXT: {{ $}}
				; GCN-NEXT: bb.2:
				; GCN-NEXT: S_ENDPGM 0
				bb.0:
				%1076:av_512_align2 = IMPLICIT_DEF
				%1073:av_512_align2 = IMPLICIT_DEF
				%25:vgpr_32 = IMPLICIT_DEF
				%26:vgpr_32 = IMPLICIT_DEF
				%13:vgpr_32 = IMPLICIT_DEF
				%15:vgpr_32 = IMPLICIT_DEF
				%1215:vgpr_32 = IMPLICIT_DEF
				%381:sgpr_128 = IMPLICIT_DEF
				%1225:vgpr_32 = IMPLICIT_DEF
				%1224:vgpr_32 = IMPLICIT_DEF
				%1226:vgpr_32 = IMPLICIT_DEF
				%1227:vgpr_32 = IMPLICIT_DEF
				%1228:vgpr_32 = IMPLICIT_DEF
				%1229:vgpr_32 = IMPLICIT_DEF
				%1230:vgpr_32 = IMPLICIT_DEF
				%1231:vgpr_32 = IMPLICIT_DEF
				%1232:av_128_align2 = IMPLICIT_DEF
				%1091:vreg_128_align2 = IMPLICIT_DEF
				%1067:vreg_128_align2 = IMPLICIT_DEF
				%27:vgpr_32 = IMPLICIT_DEF
				%1216:vgpr_32 = IMPLICIT_DEF
				%1217:vgpr_32 = IMPLICIT_DEF
				%1218:vgpr_32 = IMPLICIT_DEF
				%1219:vgpr_32 = IMPLICIT_DEF
				%1220:vgpr_32 = IMPLICIT_DEF
				%1221:vgpr_32 = IMPLICIT_DEF
				%1222:vgpr_32 = IMPLICIT_DEF
				%1223:vgpr_32 = IMPLICIT_DEF
				%29:sreg_32 = IMPLICIT_DEF
				%1214:sreg_32 = IMPLICIT_DEF
				%419:sreg_32 = IMPLICIT_DEF
				%421:sreg_32 = IMPLICIT_DEF
				%387:sgpr_128 = IMPLICIT_DEF
				%1216:vgpr_32 = IMPLICIT_DEF

				bb.1:
				IGLP_OPT 1
				%489:av_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3)
				%494:av_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3)
				%499:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 0, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3)
				%504:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 2064, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3)
				%527:areg_512_align2 = COPY %1073:av_512_align2
				%527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub0_sub1:av_128_align2, %499.sub0_sub1:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub2_sub3:av_128_align2, %499.sub2_sub3:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub0_sub1:av_128_align2, %504.sub0_sub1:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%527:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub2_sub3:av_128_align2, %504.sub2_sub3:av_128_align2, %527:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%530:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3)
				%535:av_128_align2 = DS_READ_B128_gfx9 %26:vgpr_32, 3088, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3)
				%554:areg_512_align2 = COPY %1076:av_512_align2
				%554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub0_sub1:av_128_align2, %530.sub0_sub1:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %489.sub2_sub3:av_128_align2, %530.sub2_sub3:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub0_sub1:av_128_align2, %535.sub0_sub1:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				%554:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 %494.sub2_sub3:av_128_align2, %535.sub2_sub3:av_128_align2, %554:areg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
				DS_WRITE_B128_gfx9 %13:vgpr_32, %1232:av_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in6, !alias.scope !0, addrspace 3)
				%1232:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %1215:vgpr_32, %381:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7)
				%1091.sub0:vreg_128_align2 = V_PERM_B32_e64 %1225:vgpr_32, %1224:vgpr_32, %419:sreg_32, implicit $exec
				%1067.sub0:vreg_128_align2 = V_PERM_B32_e64 %1225:vgpr_32, %1224:vgpr_32, %421:sreg_32, implicit $exec
				%1091.sub1:vreg_128_align2 = V_PERM_B32_e64 %1227:vgpr_32, %1226:vgpr_32, %419:sreg_32, implicit $exec
				%1067.sub1:vreg_128_align2 = V_PERM_B32_e64 %1227:vgpr_32, %1226:vgpr_32, %421:sreg_32, implicit $exec
				%1091.sub2:vreg_128_align2 = V_PERM_B32_e64 %1229:vgpr_32, %1228:vgpr_32, %419:sreg_32, implicit $exec
				%1067.sub2:vreg_128_align2 = V_PERM_B32_e64 %1229:vgpr_32, %1228:vgpr_32, %421:sreg_32, implicit $exec
				%1091.sub3:vreg_128_align2 = V_PERM_B32_e64 %1231:vgpr_32, %1230:vgpr_32, %419:sreg_32, implicit $exec
				%1067.sub3:vreg_128_align2 = V_PERM_B32_e64 %1231:vgpr_32, %1230:vgpr_32, %421:sreg_32, implicit $exec
				DS_WRITE_B128_gfx9 %15:vgpr_32, %1091:vreg_128_align2, 0, 0, implicit $exec :: (store (s128) into %ir.in8, !alias.scope !0, addrspace 3)
				DS_WRITE_B128_gfx9 %15:vgpr_32, %1067:vreg_128_align2, 16, 0, implicit $exec :: (store (s128) into %ir.in9, !alias.scope !0, addrspace 3)
				%572:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1216:vgpr_32, implicit $exec
				%1224:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %572:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in10, !alias.scope !0, addrspace 7)
				%573:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1217:vgpr_32, implicit $exec
				%1225:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %573:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in11, !alias.scope !0, addrspace 7)
				%574:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1218:vgpr_32, implicit $exec
				%1226:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %574:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in12, !alias.scope !0, addrspace 7)
				%575:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1219:vgpr_32, implicit $exec
				%1227:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %575:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in13, !alias.scope !0, addrspace 7)
				%576:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1220:vgpr_32, implicit $exec
				%1228:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %576:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in14, !alias.scope !0, addrspace 7)
				%577:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1221:vgpr_32, implicit $exec
				%1229:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %577:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in15, !alias.scope !0, addrspace 7)
				%578:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1222:vgpr_32, implicit $exec
				%1230:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %578:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in16, !alias.scope !0, addrspace 7)
				%579:vgpr_32 = V_ADD_U32_e32 %27:vgpr_32, %1223:vgpr_32, implicit $exec
				%1231:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %579:vgpr_32, %387:sgpr_128, 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in17, !alias.scope !0, addrspace 7)
				%1223:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1223:vgpr_32, implicit $exec
				%1222:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1222:vgpr_32, implicit $exec
				%1221:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1221:vgpr_32, implicit $exec
				%1220:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1220:vgpr_32, implicit $exec
				%1219:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1219:vgpr_32, implicit $exec
				%1218:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1218:vgpr_32, implicit $exec
				%1217:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1217:vgpr_32, implicit $exec
				%1216:vgpr_32 = V_ADD_U32_e32 %29:sreg_32, %1216:vgpr_32, implicit $exec
				%1215:vgpr_32 = V_ADD_U32_e32 64, %1215:vgpr_32, implicit $exec
				%1214:sreg_32 = nsw S_ADD_I32 %1214:sreg_32, -1, implicit-def dead $scc
				S_CMP_LG_U32 %1214:sreg_32, 0, implicit-def $scc
				S_CBRANCH_SCC1 %bb.1, implicit killed $scc

				bb.2:
				S_ENDPGM 0
				---

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][IGLP] Add iglp_opt(1) strategy for single wave gemms
Closed, Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 537758

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][IGLP] Add iglp_opt(1) strategy for single wave gemms
Closed, Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 537758

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir

[AMDGPU][IGLP] Add iglp_opt(1) strategy for single wave gemms
Closed, Public