Diff 526801

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Show First 20 Lines • Show All 248 Lines • ▼ Show 20 Lines	class PipelineSolver {
// Index to the pipeline that is currently being fitted		// Index to the pipeline that is currently being fitted
int CurrSyncGroupIdx = 0;		int CurrSyncGroupIdx = 0;
// The first non trivial pipeline		// The first non trivial pipeline
int BeginSyncGroupIdx = 0;		int BeginSyncGroupIdx = 0;

// How many branches we have explored		// How many branches we have explored
uint64_t BranchesExplored = 0;		uint64_t BranchesExplored = 0;

		// The direction in which we process the candidate SchedGroups per SU
		bool IsBottomUp = 1;
		kerbowaUnsubmitted Done Reply Inline Actions Could this just be a boolean value? kerbowa: Could this just be a boolean value?

// Update indices to fit next conflicting instruction		// Update indices to fit next conflicting instruction
void advancePosition();		void advancePosition();
// Recede indices to attempt to find better fit for previous conflicting		// Recede indices to attempt to find better fit for previous conflicting
// instruction		// instruction
void retreatPosition();		void retreatPosition();

// The exponential time algorithm which finds the provably best fit		// The exponential time algorithm which finds the provably best fit
bool solveExact();		bool solveExact();
// The polynomial time algorithm which attempts to find a good fit		// The polynomial time algorithm which attempts to find a good fit
bool solveGreedy();		bool solveGreedy();
		// Find the best SchedGroup for the current SU using the heuristic given all
		// current information. One step in the greedy algorithm. Templated against
		// the SchedGroup iterator (either reverse or forward).
		template <typename T>
		void greedyFind(std::vector<std::pair<SUnit , SUnit >> &AddedEdges, T I,
		T E);
// Whether or not the current solution is optimal		// Whether or not the current solution is optimal
bool checkOptimal();		bool checkOptimal();
// Populate the ready list, prioritizing fewest missed edges first		// Populate the ready list, prioritizing fewest missed edges first
void populateReadyList(SUToCandSGsPair &CurrSU,		// Templated against the SchedGroup iterator (either reverse or forward).
SmallVectorImpl<std::pair<int, int>> &ReadyList,		template <typename T>
SmallVectorImpl<SchedGroup> &SyncPipeline);		void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
		T E);
// Add edges corresponding to the SchedGroups as assigned by solver		// Add edges corresponding to the SchedGroups as assigned by solver
void makePipeline();		void makePipeline();
		// Link the SchedGroups in the best found pipeline.
		// Templated against the SchedGroup iterator (either reverse or forward).
		template <typename T> void linkSchedGroups(T I, T E);
// Add the edges from the SU to the other SchedGroups in pipeline, and		// Add the edges from the SU to the other SchedGroups in pipeline, and
// return the number of edges missed.		// return the number of edges missed.
int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,		int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
std::vector<std::pair<SUnit , SUnit >> &AddedEdges);		std::vector<std::pair<SUnit , SUnit >> &AddedEdges);
// Remove the edges passed via AddedEdges		// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
		// returns the cost (in terms of missed pipeline edges), and tracks the edges
		// added in \p AddedEdges
		template <typename T>
		int linkSUnit(SUnit *SU, int SGID,
		std::vector<std::pair<SUnit , SUnit >> &AddedEdges, T I, T E);
		// Remove the edges passed via \p AddedEdges
void removeEdges(const std::vector<std::pair<SUnit , SUnit >> &AddedEdges);		void removeEdges(const std::vector<std::pair<SUnit , SUnit >> &AddedEdges);
// Convert the passed in maps to arrays for bidirectional iterators		// Convert the passed in maps to arrays for bidirectional iterators
void convertSyncMapsToArrays();		void convertSyncMapsToArrays();

void reset();		void reset();

public:		public:
// Invoke the solver to map instructions to instruction groups. Heuristic &&		// Invoke the solver to map instructions to instruction groups. Heuristic &&
// command-line-option determines to use exact or greedy algorithm.		// command-line-option determines to use exact or greedy algorithm.
void solve();		void solve();

PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,		PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,		DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
ScheduleDAGMI *DAG)		ScheduleDAGMI *DAG, bool IsBottomUp = 1)
: DAG(DAG), SyncedInstrs(SyncedInstrs),		: DAG(DAG), SyncedInstrs(SyncedInstrs),
SyncedSchedGroups(SyncedSchedGroups) {		SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

for (auto &PipelineInstrs : SyncedInstrs) {		for (auto &PipelineInstrs : SyncedInstrs) {
if (PipelineInstrs.second.size() > 0) {		if (PipelineInstrs.second.size() > 0) {
NeedsSolver = true;		NeedsSolver = true;
break;		break;
}		}
}		}

▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	for (auto &SUsToCandSGs : SyncInstrMap.second) {
++SortPosition;		++SortPosition;
PipelineInstrs[PipelineIDx].insert(		PipelineInstrs[PipelineIDx].insert(
SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));		SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
}		}
--PipelineIDx;		--PipelineIDx;
}		}
}		}

		template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
		for (; I != E; ++I) {
		auto &GroupA = *I;
		for (auto J = std::next(I); J != E; ++J) {
		auto &GroupB = *J;
		GroupA.link(GroupB);
		}
		}
		}

void PipelineSolver::makePipeline() {		void PipelineSolver::makePipeline() {
// Preserve the order of barrier for subsequent SchedGroupBarrier mutations		// Preserve the order of barrier for subsequent SchedGroupBarrier mutations
for (auto &SyncPipeline : BestPipeline) {		for (auto &SyncPipeline : BestPipeline) {
for (auto &SG : SyncPipeline) {		for (auto &SG : SyncPipeline) {
		LLVM_DEBUG(dbgs() << "Printing SchedGroups\nSchedGroup with SGID "
		kerbowaUnsubmitted Done Reply Inline Actions Combine the two LLVM_DEBUG macros? kerbowa: Combine the two LLVM_DEBUG macros?
		<< SG.getSGID() << " has: \n");
SUnit *SGBarr = nullptr;		SUnit *SGBarr = nullptr;
for (auto &SU : SG.Collection) {		for (auto &SU : SG.Collection) {
if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)		if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
SGBarr = SU;		SGBarr = SU;
		LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
}		}
// Command line requested IGroupLP doesn't have SGBarr		// Command line requested IGroupLP doesn't have SGBarr
if (!SGBarr)		if (!SGBarr)
continue;		continue;
resetEdges(*SGBarr, DAG);		resetEdges(*SGBarr, DAG);
SG.link(*SGBarr, false);		SG.link(*SGBarr, false);
}		}
}		}

for (auto &SyncPipeline : BestPipeline) {		for (auto &SyncPipeline : BestPipeline) {
auto I = SyncPipeline.rbegin();		IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
		kerbowaUnsubmitted Done Reply Inline Actions Are we able to still use iterators here? auto I = IsBottomUp ? SyncPipeline.rbegin() : SyncPipeline.begin(); auto E = IsBottomUp ? SyncPipeline.rend() : SyncPipeline.end(); kerbowa: Are we able to still use iterators here? auto I = IsBottomUp ? SyncPipeline.rbegin()…
		jrbyrnesAuthorUnsubmitted Done Reply Inline Actions This doesn't work out of the box. SmallVector::iterator (T *) isn't compatible with SmallVector::reverse_iterator (std::reverse_iterator<T>). I can template this code based on the iterator, but I thought the current implementation was the cleanest way to do it (unfortunately). We can switch to ranges in C++20. Please correct me if I'm missing something. jrbyrnes: This doesn't work out of the box. SmallVector::iterator (T *) isn't compatible with SmallVector…
		kerbowaUnsubmitted Not Done Reply Inline Actions Oh right. Could you use two different for_each? f = loop(GroupA.link(GroupB) if (IsBottomUp) for_each(rbegin(), rend(), f) else for_each(begin(), end(), f) kerbowa: Oh right. Could you use two different for_each? f = loop(GroupA.link(GroupB) if (IsBottomUp)…
auto E = SyncPipeline.rend();		: linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
for (; I != E; ++I) {
auto &GroupA = *I;
for (auto J = std::next(I); J != E; ++J) {
auto &GroupB = *J;
GroupA.link(GroupB);
}
}
}		}
}		}

int PipelineSolver::addEdges(		template <typename T>
SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,		int PipelineSolver::linkSUnit(
std::vector<std::pair<SUnit , SUnit >> &AddedEdges) {		SUnit SU, int SGID, std::vector<std::pair<SUnit , SUnit *>> &AddedEdges,
int AddedCost = 0;		T I, T E) {
bool MakePred = false;		bool MakePred = false;
		int AddedCost = 0;
		kerbowaUnsubmitted Done Reply Inline Actions I don't think we should remove the comment explaining MakePred. kerbowa: I don't think we should remove the comment explaining MakePred.
// The groups in the pipeline are in reverse order. Thus,		for (; I < E; ++I) {
// by traversing them from last to first, we are traversing		if (I->getSGID() == SGID) {
// them in the order as they were introduced in the code. After we
// pass the group the SU is being assigned to, it should be
// linked as a predecessor of the subsequent SchedGroups
auto GroupNo = (int)SyncPipeline.size() - 1;
for (; GroupNo >= 0; GroupNo--) {
if (SyncPipeline[GroupNo].getSGID() == SGID) {
MakePred = true;		MakePred = true;
continue;		continue;
}		}
auto Group = &SyncPipeline[GroupNo];		auto Group = *I;
AddedCost += Group->link(*SU, MakePred, AddedEdges);		AddedCost += Group.link(*SU, MakePred, AddedEdges);
assert(AddedCost >= 0);		assert(AddedCost >= 0);
}		}

return AddedCost;		return AddedCost;
}		}

		int PipelineSolver::addEdges(
		SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
		std::vector<std::pair<SUnit , SUnit >> &AddedEdges) {

		// For IsBottomUp, the first SchedGroup in SyncPipeline contains the
		// instructions that are the ultimate successors in the resultant mutation.
		// Therefore, in such a configuration, the SchedGroups occurring before the
		// candidate SGID are successors of the candidate SchedGroup, thus the current
		// SU should be linked as a predecessor to SUs in those SchedGroups. The
		// opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
		// SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
		// IsBottomUp (in reverse).
		return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
		SyncPipeline.rend())
		: linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
		SyncPipeline.end());
		}

void PipelineSolver::removeEdges(		void PipelineSolver::removeEdges(
const std::vector<std::pair<SUnit , SUnit >> &EdgesToRemove) {		const std::vector<std::pair<SUnit , SUnit >> &EdgesToRemove) {
// Only remove the edges that we have added when testing		// Only remove the edges that we have added when testing
// the fit.		// the fit.
for (auto &PredSuccPair : EdgesToRemove) {		for (auto &PredSuccPair : EdgesToRemove) {
SUnit *Pred = PredSuccPair.first;		SUnit *Pred = PredSuccPair.first;
SUnit *Succ = PredSuccPair.second;		SUnit *Succ = PredSuccPair.second;

▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	bool PipelineSolver::checkOptimal() {

bool DoneExploring = false;		bool DoneExploring = false;
if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)		if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
DoneExploring = true;		DoneExploring = true;

return (DoneExploring \|\| BestCost == 0);		return (DoneExploring \|\| BestCost == 0);
}		}

		template <typename T>
void PipelineSolver::populateReadyList(		void PipelineSolver::populateReadyList(
SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,		SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
SmallVectorImpl<SchedGroup> &SyncPipeline) {		SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
		auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
assert(CurrSU.second.size() >= 1);		assert(CurrSU.second.size() >= 1);
auto I = CurrSU.second.rbegin();
auto E = CurrSU.second.rend();
for (; I != E; ++I) {		for (; I != E; ++I) {
		kerbowaUnsubmitted Done Reply Inline Actions Just use iterators? kerbowa: Just use iterators?
std::vector<std::pair<SUnit , SUnit >> AddedEdges;		std::vector<std::pair<SUnit , SUnit >> AddedEdges;
int CandSGID = *I;		int CandSGID = *I;
SchedGroup *Match;		SchedGroup *Match;
for (auto &SG : SyncPipeline) {		for (auto &SG : SyncPipeline) {
if (SG.getSGID() == CandSGID)		if (SG.getSGID() == CandSGID)
Match = &SG;		Match = &SG;
}		}

Show All 32 Lines	assert(static_cast<size_t>(CurrConflInstNo) <
PipelineInstrs[CurrSyncGroupIdx].size());		PipelineInstrs[CurrSyncGroupIdx].size());
SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];		SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum		LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");		<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");

// SchedGroup -> Cost pairs		// SchedGroup -> Cost pairs
SmallVector<std::pair<int, int>, 4> ReadyList;		SmallVector<std::pair<int, int>, 4> ReadyList;
// Prioritize the candidate sched groups in terms of lowest cost first		// Prioritize the candidate sched groups in terms of lowest cost first
populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);		IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
		CurrSU.second.rend())
		: populateReadyList(ReadyList, CurrSU.second.begin(),
		CurrSU.second.end());

auto I = ReadyList.begin();		auto I = ReadyList.begin();
auto E = ReadyList.end();		auto E = ReadyList.end();
for (; I != E; ++I) {		for (; I != E; ++I) {
// If we are trying SGs in least cost order, and the current SG is cost		// If we are trying SGs in least cost order, and the current SG is cost
// infeasible, then all subsequent SGs will also be cost infeasible, so we		// infeasible, then all subsequent SGs will also be cost infeasible, so we
// can prune.		// can prune.
if (BestCost != -1 && (CurrCost + I->second > BestCost))		if (BestCost != -1 && (CurrCost + I->second > BestCost))
▲ Show 20 Lines • Show All 58 Lines • ▼ Show 20 Lines	if (CurrCost < BestCost \|\| BestCost == -1) {
}		}
}		}

retreatPosition();		retreatPosition();
CurrCost -= MissPenalty;		CurrCost -= MissPenalty;
return FinishedExploring;		return FinishedExploring;
}		}

bool PipelineSolver::solveGreedy() {		template <typename T>
BestCost = 0;		void PipelineSolver::greedyFind(
std::vector<std::pair<SUnit , SUnit >> AddedEdges;		std::vector<std::pair<SUnit , SUnit >> &AddedEdges, T I, T E) {

while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];		SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
int BestNodeCost = -1;		int BestNodeCost = -1;
int TempCost;		int TempCost;
SchedGroup *BestGroup = nullptr;		SchedGroup *BestGroup = nullptr;
int BestGroupID = -1;		int BestGroupID = -1;
auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];		auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum		LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");		<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");

// Since we have added the potential SchedGroups from bottom up, but		// Since we have added the potential SchedGroups from bottom up, but
		kerbowaUnsubmitted Done Reply Inline Actions Ditto. kerbowa: Ditto.
// traversed the DAG from top down, parse over the groups from last to		// traversed the DAG from top down, parse over the groups from last to
// first. If we fail to do this for the greedy algorithm, the solution will		// first. If we fail to do this for the greedy algorithm, the solution will
// likely not be good in more complex cases.		// likely not be good in more complex cases.
auto I = CurrSU.second.rbegin();
auto E = CurrSU.second.rend();
for (; I != E; ++I) {		for (; I != E; ++I) {
std::vector<std::pair<SUnit , SUnit >> AddedEdges;		std::vector<std::pair<SUnit , SUnit >> AddedEdges;
int CandSGID = *I;		int CandSGID = *I;
SchedGroup *Match;		SchedGroup *Match;
for (auto &SG : SyncPipeline) {		for (auto &SG : SyncPipeline) {
if (SG.getSGID() == CandSGID)		if (SG.getSGID() == CandSGID)
Match = &SG;		Match = &SG;
}		}

LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "		LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
<< (int)Match->getMask() << "\n");		<< (int)Match->getMask() << "\n");

if (Match->isFull()) {		if (Match->isFull()) {
LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");		LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
continue;		continue;
}		}
TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);		TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");		LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
if (TempCost < BestNodeCost \|\| BestNodeCost == -1) {		if (TempCost < BestNodeCost \|\| BestNodeCost == -1) {
BestGroup = Match;		BestGroup = Match;
BestNodeCost = TempCost;		BestNodeCost = TempCost;
BestGroupID = CandSGID;		BestGroupID = CandSGID;
}		}
removeEdges(AddedEdges);		removeEdges(AddedEdges);
if (BestNodeCost == 0)		if (BestNodeCost == 0)
break;		break;
}		}

if (BestGroupID != -1) {		if (BestGroupID != -1) {
BestGroup->add(*CurrSU.first);		BestGroup->add(*CurrSU.first);
addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);		addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"		LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
<< (int)BestGroup->getMask() << "\n");		<< (int)BestGroup->getMask() << "\n");
BestCost += TempCost;		BestCost += TempCost;
} else		} else
BestCost += MissPenalty;		BestCost += MissPenalty;

CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;		CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
		}

		bool PipelineSolver::solveGreedy() {
		BestCost = 0;
		std::vector<std::pair<SUnit , SUnit >> AddedEdges;

		while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
		SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
		IsBottomUp
		? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
		: greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
advancePosition();		advancePosition();
}		}
BestPipeline = CurrPipeline;		BestPipeline = CurrPipeline;
removeEdges(AddedEdges);		removeEdges(AddedEdges);
return false;		return false;
}		}

unsigned PipelineSolver::computeProblemSize() {		unsigned PipelineSolver::computeProblemSize() {
Show All 27 Lines	if (BestCost > 0) {
LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");		LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
}		}
} else { // Use the Greedy Algorithm by default		} else { // Use the Greedy Algorithm by default
LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");		LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
solveGreedy();		solveGreedy();
}		}

makePipeline();		makePipeline();
		LLVM_DEBUG(dbgs() << "After applying mutation\n");
		LLVM_DEBUG(DAG->dump());
}		}

enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };		enum IGLPStrategyID : int { MFMASmallGemmOptID = 0, DemoOptID = 1 };

// Implement a IGLP scheduling strategy.		// Implement a IGLP scheduling strategy.
class IGLPStrategy {		class IGLPStrategy {
protected:		protected:
ScheduleDAGInstrs *DAG;		ScheduleDAGInstrs *DAG;

const SIInstrInfo *TII;		const SIInstrInfo *TII;

public:		public:
// Add SchedGroups to \p Pipeline to implement this Strategy.		// Add SchedGroups to \p Pipeline to implement this Strategy.
virtual void applyIGLPStrategy(		virtual void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,		DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;		DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;

// Returns true if this strategy should be applied to a ScheduleDAG.		// Returns true if this strategy should be applied to a ScheduleDAG.
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;		virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;

		bool IsBottomUp = 1;
		kerbowaUnsubmitted Done Reply Inline Actions Does this really need to be virtual if the derived class is just setting a flag to up/down? What special handling may we need here for different strategies beyond just setting the flag in the constructor? kerbowa: Does this really need to be virtual if the derived class is just setting a flag to up/down?

IGLPStrategy(ScheduleDAGInstrs DAG, const SIInstrInfo TII)		IGLPStrategy(ScheduleDAGInstrs DAG, const SIInstrInfo TII)
: DAG(DAG), TII(TII) {}		: DAG(DAG), TII(TII) {}

virtual ~IGLPStrategy() = default;		virtual ~IGLPStrategy() = default;
};		};

class MFMASmallGemmOpt final : public IGLPStrategy {		class MFMASmallGemmOpt final : public IGLPStrategy {
		private:
		kerbowaUnsubmitted Not Done Reply Inline Actions Remove kerbowa: Remove
public:		public:
void applyIGLPStrategy(		void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,		DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;		DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;

bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }		bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

MFMASmallGemmOpt(ScheduleDAGInstrs DAG, const SIInstrInfo TII)		MFMASmallGemmOpt(ScheduleDAGInstrs DAG, const SIInstrInfo TII)
: IGLPStrategy(DAG, TII) {}		: IGLPStrategy(DAG, TII) {
		IsBottomUp = 1;
		}
};		};

void MFMASmallGemmOpt::applyIGLPStrategy(		void MFMASmallGemmOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,		DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {		DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
// Count the number of MFMA instructions.		// Count the number of MFMA instructions.
unsigned MFMACount = 0;		unsigned MFMACount = 0;
for (const MachineInstr &I : *DAG)		for (const MachineInstr &I : *DAG)
if (TII->isMFMAorWMMA(I))		if (TII->isMFMAorWMMA(I))
++MFMACount;		++MFMACount;

const unsigned PipelineSyncID = 0;		const unsigned PipelineSyncID = 0;
SchedGroup *SG = nullptr;		SchedGroup *SG = nullptr;
for (unsigned I = 0; I < MFMACount * 3; ++I) {		for (unsigned I = 0; I < MFMACount * 3; ++I) {
SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);		SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
}		}
}		}

		class DemoOpt final : public IGLPStrategy {
		kerbowaUnsubmitted Done Reply Inline Actions Is this strategy just a demo of this new feature? What are your thoughts on upstreaming this? kerbowa: Is this strategy just a demo of this new feature? What are your thoughts on upstreaming this?
		jrbyrnesAuthorUnsubmitted Done Reply Inline Actions Hey -- thanks for bringing that up. It is to demo the new feature -- put another way, it is to facilitate lit testing of the new feature. The assumption is that clients of iglp_opt have an intimate understanding of the i32s passed to the builtin. However, I see that mixing testing and features is less than ideal. Maybe it makes the most sense to have incremental tests (under iglp_opt(1)) that is superseded by the singlewave strategy which incorporates all the new features? Otherwise, we will need a mechanism to test new features without changing the existing behavior of the current builtins -- whether it be under iglp_opt or a separate intrinsic. jrbyrnes: Hey -- thanks for bringing that up. It is to demo the new feature -- put another way, it is…
		private:
		public:
		void applyIGLPStrategy(
		DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
		DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;

		bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

		DemoOpt(ScheduleDAGInstrs DAG, const SIInstrInfo TII)
		: IGLPStrategy(DAG, TII) {
		IsBottomUp = 0;
		}
		};

		void DemoOpt::applyIGLPStrategy(
		DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
		DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
		// Count the number of MFMA instructions.
		unsigned MFMACount = 0;
		for (const MachineInstr &I : *DAG)
		if (TII->isMFMAorWMMA(I))
		++MFMACount;

		const unsigned PipelineSyncID = 0;
		SchedGroup *SG = nullptr;
		for (unsigned I = 0; I < MFMACount * 3; ++I) {
		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

		SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
		SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
		SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
		}
		}

static std::unique_ptr<IGLPStrategy>		static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,		createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
const SIInstrInfo *TII) {		const SIInstrInfo *TII) {
switch (ID) {		switch (ID) {
case MFMASmallGemmOptID:		case MFMASmallGemmOptID:
return std::make_unique<MFMASmallGemmOpt>(DAG, TII);		return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
		case DemoOptID:
		return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
}		}

llvm_unreachable("Unknown IGLPStrategyID");		llvm_unreachable("Unknown IGLPStrategyID");
}		}

class IGroupLPDAGMutation : public ScheduleDAGMutation {		class IGroupLPDAGMutation : public ScheduleDAGMutation {
private:		private:
const SIInstrInfo *TII;		const SIInstrInfo *TII;
Show All 26 Lines	private:
void initSchedGroupBarrierPipelineStage(		void initSchedGroupBarrierPipelineStage(
std::vector<SUnit>::reverse_iterator RIter);		std::vector<SUnit>::reverse_iterator RIter);

void initIGLPOpt(SUnit &SU);		void initIGLPOpt(SUnit &SU);

public:		public:
void apply(ScheduleDAGInstrs *DAGInstrs) override;		void apply(ScheduleDAGInstrs *DAGInstrs) override;

		// The order in which the PipelineSolver should process the candidate
		// SchedGroups for a PipelineInstr. Bottom-up (IsBottomUp == true) will try
		// to add SUs to the last created SchedGroup first, and will consider that
		// as the ultimate predecessor group when linking. Top-down (IsBottomUp ==
		// false) instead links and processes the first created SchedGroup first.
		bool IsBottomUp = 1;

IGroupLPDAGMutation() = default;		IGroupLPDAGMutation() = default;
};		};

unsigned SchedGroup::NumSchedGroups = 0;		unsigned SchedGroup::NumSchedGroups = 0;

bool SchedGroup::tryAddEdge(SUnit A, SUnit B) {		bool SchedGroup::tryAddEdge(SUnit A, SUnit B) {
if (A != B && DAG->canAddEdge(B, A)) {		if (A != B && DAG->canAddEdge(B, A)) {
DAG->addEdge(B, SDep(A, SDep::Artificial));		DAG->addEdge(B, SDep(A, SDep::Artificial));
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines	for (auto *A : Collection) {
SUnit *B = &SU;		SUnit *B = &SU;
if (A == B \|\| A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)		if (A == B \|\| A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
continue;		continue;
if (MakePred)		if (MakePred)
std::swap(A, B);		std::swap(A, B);

if (DAG->IsReachable(B, A))		if (DAG->IsReachable(B, A))
continue;		continue;

// tryAddEdge returns false if there is a dependency that makes adding		// tryAddEdge returns false if there is a dependency that makes adding
// the A->B edge impossible, otherwise it returns true;		// the A->B edge impossible, otherwise it returns true;
bool Added = tryAddEdge(A, B);		bool Added = tryAddEdge(A, B);
if (Added)		if (Added)
AddedEdges.push_back(std::pair(A, B));		AddedEdges.push_back(std::pair(A, B));
else		else
++MissedEdges;		++MissedEdges;
}		}
▲ Show 20 Lines • Show All 110 Lines • ▼ Show 20 Lines	if (Opc == AMDGPU::SCHED_BARRIER) {
resetEdges(*R, DAG);		resetEdges(*R, DAG);
if (!foundSB && !foundIGLP)		if (!foundSB && !foundIGLP)
initIGLPOpt(*R);		initIGLPOpt(*R);
foundIGLP = true;		foundIGLP = true;
}		}
}		}

if (foundSB \|\| foundIGLP) {		if (foundSB \|\| foundIGLP) {
PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);		PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
// PipelineSolver performs the mutation by adding the edges it		// PipelineSolver performs the mutation by adding the edges it
// determined as the best		// determined as the best
PS.solve();		PS.solve();
}		}
}		}

void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {		void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
MachineInstr &MI = *SchedBarrier.getInstr();		MachineInstr &MI = *SchedBarrier.getInstr();
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines	void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(

SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);		SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
}		}

void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {		void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
IGLPStrategyID StrategyID =		IGLPStrategyID StrategyID =
(IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();		(IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
auto S = createIGLPStrategy(StrategyID, DAG, TII);		auto S = createIGLPStrategy(StrategyID, DAG, TII);
if (S->shouldApplyStrategy(DAG))		if (S->shouldApplyStrategy(DAG)) {
		IsBottomUp = S->IsBottomUp;
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);		S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
}		}
		}

} // namespace		} // namespace

namespace llvm {		namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {		std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
return std::make_unique<IGroupLPDAGMutation>();		return std::make_unique<IGroupLPDAGMutation>();
}		}

} // end namespace llvm		} // end namespace llvm

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

Show First 20 Lines • Show All 141 Lines • ▼ Show 20 Lines	entry:
store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr		store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr
%store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192		%store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr		store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr
%store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256		%store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr		store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr
ret void		ret void
}		}


		define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
		; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
		; GCN: ; %bb.0: ; %entry
		; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
		; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
		; GCN-NEXT: v_mov_b32_e32 v3, 2.0
		; GCN-NEXT: ; iglp_opt mask(0x00000001)
		; GCN-NEXT: s_waitcnt lgkmcnt(0)
		; GCN-NEXT: v_add_u32_e32 v1, s0, v0
		; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1
		; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456
		; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440
		; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424
		; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408
		; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344
		; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360
		; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376
		; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392
		; GCN-NEXT: v_mov_b32_e32 v2, 1.0
		; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264
		; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248
		; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232
		; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216
		; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200
		; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184
		; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168
		; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152
		; GCN-NEXT: s_waitcnt lgkmcnt(8)
		; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
		; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112
		; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96
		; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592
		; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576
		; GCN-NEXT: v_add_u32_e32 v0, s1, v0
		; GCN-NEXT: s_waitcnt lgkmcnt(4)
		; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
		; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80
		; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64
		; GCN-NEXT: ds_read_b128 a[128:131], v1
		; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16
		; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32
		; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48
		; GCN-NEXT: s_waitcnt lgkmcnt(0)
		; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
		; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304
		; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288
		; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272
		; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256
		; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240
		; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224
		; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208
		; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192
		; GCN-NEXT: s_waitcnt lgkmcnt(0)
		; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
		; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688
		; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672
		; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656
		; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640
		; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624
		; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608
		; GCN-NEXT: s_nop 2
		; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
		; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
		; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
		; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64
		; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48
		; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32
		; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16
		; GCN-NEXT: ds_write_b128 v0, a[128:131]
		; GCN-NEXT: v_mov_b32_e32 v0, s1
		; GCN-NEXT: s_waitcnt lgkmcnt(8)
		; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
		; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672
		; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688
		; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640
		; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288
		; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304
		; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256
		; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272
		; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224
		; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240
		; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192
		; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208
		; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656
		; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608
		; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624
		; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576
		; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592
		; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
		; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
		; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832
		; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480
		; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496
		; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448
		; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464
		; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416
		; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432
		; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384
		; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400
		; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848
		; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800
		; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816
		; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768
		; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784
		; GCN-NEXT: s_endpgm
		entry:
		call void @llvm.amdgcn.iglp.opt(i32 1)
		%idx = call i32 @llvm.amdgcn.workitem.id.x()
		%load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
		%load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
		%load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64
		%load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr
		%load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128
		%load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr
		%load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192
		%load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr
		%load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256
		%load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr
		%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0)
		%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0)
		%mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0)
		%mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0)
		%mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0)
		%store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
		store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr
		%store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64
		store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr
		%store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128
		store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr
		%store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
		store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr
		%store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
		store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr
		ret void
		}


declare void @llvm.amdgcn.iglp.opt(i32) #1		declare void @llvm.amdgcn.iglp.opt(i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1		declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1

attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" }		attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
attributes #1 = { convergent nounwind }		attributes #1 = { convergent nounwind }

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][IGLP] Parameterize the SchedGroup processing / linking in Solver
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 526801

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][IGLP] Parameterize the SchedGroup processing / linking in Solver
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 526801

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

[AMDGPU][IGLP] Parameterize the SchedGroup processing / linking in Solver
ClosedPublic