Diff 460106

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	static cl::opt<bool> UseCostHeur(
"amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,		"amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
cl::desc("Whether to use the cost heuristic to make choices as we "		cl::desc("Whether to use the cost heuristic to make choices as we "
"traverse the search space using the exact solver. Defaulted "		"traverse the search space using the exact solver. Defaulted "
"to on, and if turned off, we will use the node order -- "		"to on, and if turned off, we will use the node order -- "
"attempting to put the later nodes in the later sched groups. "		"attempting to put the later nodes in the later sched groups. "
"Experimentally, results are mixed, so this should be set on a "		"Experimentally, results are mixed, so this should be set on a "
"case-by-case basis."));		"case-by-case basis."));

		// Hidden command-line switch (off by default) selecting whether the exact
		// solver also evaluates a lower bound on the cost of the still-unassigned
		// instructions when deciding if a partial fit is worth exploring further.
		static cl::opt<bool> EnableLowerBound(
		    "amdgpu-igrouplp-exact-solver-lower-bound", cl::Hidden,
		    cl::desc("Whether to use a lower bound when calculating the cost "
		             "for a partial fit using the exact solver. The lower bound "
		             "calculates the cost of assigning the remaining instructions "
		             "under idealized conditions. The LB reduces the overall search "
		             "space but adds time complexity per branch explored."),
		    cl::init(false));

// Components of the mask that determines which instruction types may be		// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.		// classified into a SchedGroup.
enum class SchedGroupMask {		enum class SchedGroupMask {
NONE = 0u,		NONE = 0u,
ALU = 1u << 0,		ALU = 1u << 0,
VALU = 1u << 1,		VALU = 1u << 1,
SALU = 1u << 2,		SALU = 1u << 2,
MFMA = 1u << 3,		MFMA = 1u << 3,
Show All 32 Lines	private:

// Count of the number of created SchedGroups, used to initialize SGID.		// Count of the number of created SchedGroups, used to initialize SGID.
static unsigned NumSchedGroups;		static unsigned NumSchedGroups;

ScheduleDAGInstrs *DAG;		ScheduleDAGInstrs *DAG;

const SIInstrInfo *TII;		const SIInstrInfo *TII;

// Try to add an edge from SU A to SU B.		// Try to add an edge from SU A to SU B. This returns false if there is a
		// dependency which makes adding the A->B edge impossible, otherwise it
		// returns true. The result is that it will return true even if no edge was
		// added. For example, if there is already an edge between A->B, this will
		// return true, even though DAG->addEdge does not add an edge.
bool tryAddEdge(SUnit A, SUnit B);		bool tryAddEdge(SUnit A, SUnit B);

// Use SGMask to determine whether we can classify MI as a member of this		// Use SGMask to determine whether we can classify MI as a member of this
// SchedGroup object.		// SchedGroup object.
bool canAddMI(const MachineInstr &MI) const;		bool canAddMI(const MachineInstr &MI) const;

public:		public:
// Collection of SUnits that are classified as members of this group.		// Collection of SUnits that are classified as members of this group.
▲ Show 20 Lines • Show All 117 Lines • ▼ Show 20 Lines	class PipelineSolver {

// The cost penalty of not assigning a SU to a SchedGroup		// The cost penalty of not assigning a SU to a SchedGroup
int MissPenalty = 0;		int MissPenalty = 0;

// Costs in terms of the number of edges we are unable to add		// Costs in terms of the number of edges we are unable to add
int BestCost = -1;		int BestCost = -1;
int CurrCost = 0;		int CurrCost = 0;

		// A lower bound on the optimal cost for a complete pipeline
		int StaticLowerBound = 0;

// Index pointing to the conflicting instruction that is currently being		// Index pointing to the conflicting instruction that is currently being
// fitted		// fitted
int CurrConflInstNo = 0;		int CurrConflInstNo = 0;
// Index to the pipeline that is currently being fitted		// Index to the pipeline that is currently being fitted
int CurrSyncGroupIdx = 0;		int CurrSyncGroupIdx = 0;
// The first non trivial pipeline		// The first non trivial pipeline
int BeginSyncGroupIdx = 0;		int BeginSyncGroupIdx = 0;

Show All 11 Lines	class PipelineSolver {
// The polynomial time algorithm which attempts to find a good fit		// The polynomial time algorithm which attempts to find a good fit
bool solveGreedy();		bool solveGreedy();
// Whether or not the current solution is optimal		// Whether or not the current solution is optimal
bool checkOptimal();		bool checkOptimal();
// Populate the ready list, prioritizing fewest missed edges first		// Populate the ready list, prioritizing fewest missed edges first
void populateReadyList(SUToCandSGsPair &CurrSU,		void populateReadyList(SUToCandSGsPair &CurrSU,
SmallVectorImpl<std::pair<int, int>> &ReadyList,		SmallVectorImpl<std::pair<int, int>> &ReadyList,
SmallVectorImpl<SchedGroup> &SyncPipeline);		SmallVectorImpl<SchedGroup> &SyncPipeline);
		// Calculate best cost assignment of an unassigned SU without assigning it.
		// The sum of these costs across SUs represents a Lower Bound on the true best
		// cost for the set of unassigned SUs.
		int calculateLowerBound();
// Add edges corresponding to the SchedGroups as assigned by solver		// Add edges corresponding to the SchedGroups as assigned by solver
void makePipeline();		void makePipeline();
// Add the edges from the SU to the other SchedGroups in pipeline, and		// Add the edges from the SU to the other SchedGroups in pipeline, and
// return the number of edges missed.		// return the number of edges missed.
int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,		int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
std::vector<std::pair<SUnit , SUnit >> &AddedEdges);		std::vector<std::pair<SUnit , SUnit >> &AddedEdges,
		int BestCost = -1);
// Remove the edges passed via AddedEdges		// Remove the edges passed via AddedEdges
void removeEdges(const std::vector<std::pair<SUnit , SUnit >> &AddedEdges);		void removeEdges(const std::vector<std::pair<SUnit , SUnit >> &AddedEdges);
// Convert the passed in maps to arrays for bidirectional iterators		// Convert the passed in maps to arrays for bidirectional iterators
void convertSyncMapsToArrays();		void convertSyncMapsToArrays();

void reset();		void reset();

public:		public:
▲ Show 20 Lines • Show All 105 Lines • ▼ Show 20 Lines	for (; I != E; ++I) {
GroupA.link(GroupB);		GroupA.link(GroupB);
}		}
}		}
}		}
}		}

int PipelineSolver::addEdges(		int PipelineSolver::addEdges(
SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,		SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
std::vector<std::pair<SUnit , SUnit >> &AddedEdges) {		std::vector<std::pair<SUnit , SUnit >> &AddedEdges, int BestCost) {
int AddedCost = 0;		int AddedCost = 0;
bool MakePred = false;		bool MakePred = false;

// The groups in the pipeline are in reverse order. Thus,		// The groups in the pipeline are in reverse order. Thus,
// by traversing them from last to first, we are traversing		// by traversing them from last to first, we are traversing
// them in the order as they were introduced in the code. After we		// them in the order as they were introduced in the code. After we
// pass the group the SU is being assigned to, it should be		// pass the group the SU is being assigned to, it should be
// linked as a predecessor of the subsequent SchedGroups		// linked as a predecessor of the subsequent SchedGroups
auto GroupNo = (int)SyncPipeline.size() - 1;		auto GroupNo = (int)SyncPipeline.size() - 1;
for (; GroupNo >= 0; GroupNo--) {		for (; GroupNo >= 0; GroupNo--) {
		if (BestCost != -1 && AddedCost >= BestCost)
		return AddedCost;
if (SyncPipeline[GroupNo].getSGID() == SGID) {		if (SyncPipeline[GroupNo].getSGID() == SGID) {
MakePred = true;		MakePred = true;
continue;		continue;
}		}
auto Group = &SyncPipeline[GroupNo];		auto Group = &SyncPipeline[GroupNo];
AddedCost += Group->link(*SU, MakePred, AddedEdges);		AddedCost += Group->link(*SU, MakePred, AddedEdges);
assert(AddedCost >= 0);		assert(AddedCost >= 0);
}		}

return AddedCost;		return AddedCost;
}		}

void PipelineSolver::removeEdges(		void PipelineSolver::removeEdges(
const std::vector<std::pair<SUnit , SUnit >> &EdgesToRemove) {		const std::vector<std::pair<SUnit , SUnit >> &EdgesToRemove) {
// Only remove the edges that we have added when testing		// Only remove the edges that we have added when testing
// the fit.		// the fit.
for (auto &PredSuccPair : EdgesToRemove) {		for (auto &PredSuccPair : EdgesToRemove) {
SUnit *Pred = PredSuccPair.first;		SUnit *Pred = PredSuccPair.first;
SUnit *Succ = PredSuccPair.second;		SUnit *Succ = PredSuccPair.second;

auto Match =		auto Match =
std::find_if(Succ->Preds.begin(), Succ->Preds.end(),		std::find_if(Succ->Preds.begin(), Succ->Preds.end(), [&Pred](SDep &P) {
[&Pred](SDep &P) { return P.getSUnit() == Pred; });		return P.getSUnit() == Pred && P.isArtificial();
		});
if (Match != Succ->Preds.end()) {		if (Match != Succ->Preds.end()) {
assert(Match->isArtificial());		assert(Match->isArtificial());
Succ->removePred(*Match);		Succ->removePred(*Match);
}		}
}		}
}		}

void PipelineSolver::advancePosition() {		void PipelineSolver::advancePosition() {
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
}		}
assert(BestCost >= 0);		assert(BestCost >= 0);
}		}

bool DoneExploring = false;		bool DoneExploring = false;
if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)		if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
DoneExploring = true;		DoneExploring = true;

return (DoneExploring \|\| BestCost == 0);		return (DoneExploring \|\| BestCost == StaticLowerBound);
}		}

void PipelineSolver::populateReadyList(		void PipelineSolver::populateReadyList(
SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,		SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
SmallVectorImpl<SchedGroup> &SyncPipeline) {		SmallVectorImpl<SchedGroup> &SyncPipeline) {
assert(CurrSU.second.size() >= 1);		assert(CurrSU.second.size() >= 1);
auto I = CurrSU.second.rbegin();		auto I = CurrSU.second.rbegin();
auto E = CurrSU.second.rend();		auto E = CurrSU.second.rend();
for (; I != E; ++I) {
std::vector<std::pair<SUnit , SUnit >> AddedEdges;		std::vector<std::pair<SUnit , SUnit >> AddedEdges;
		for (; I != E; ++I) {

int CandSGID = *I;		int CandSGID = *I;
SchedGroup *Match;		SchedGroup *Match;
for (auto &SG : SyncPipeline) {		for (auto &SG : SyncPipeline) {
if (SG.getSGID() == CandSGID)		if (SG.getSGID() == CandSGID)
Match = &SG;		Match = &SG;
}		}

if (UseCostHeur) {		if (UseCostHeur) {
if (Match->isFull()) {		if (Match->isFull()) {
ReadyList.push_back(std::make_pair(*I, MissPenalty));		ReadyList.push_back(std::make_pair(*I, MissPenalty));
continue;		continue;
}		}
		AddedEdges.clear();

int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);		int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
ReadyList.push_back(std::make_pair(*I, TempCost));		ReadyList.push_back(std::make_pair(*I, TempCost));
removeEdges(AddedEdges);		removeEdges(AddedEdges);
} else		} else
ReadyList.push_back(std::make_pair(*I, -1));		ReadyList.push_back(std::make_pair(*I, -1));
}		}

if (UseCostHeur) {		if (UseCostHeur) {
std::sort(ReadyList.begin(), ReadyList.end(),		std::sort(ReadyList.begin(), ReadyList.end(),
[](std::pair<int, int> A, std::pair<int, int> B) {		[](std::pair<int, int> A, std::pair<int, int> B) {
return A.second < B.second;		return A.second < B.second;
});		});
}		}

assert(ReadyList.size() == CurrSU.second.size());		assert(ReadyList.size() == CurrSU.second.size());
}		}

		int PipelineSolver::calculateLowerBound() {
		if (CurrSyncGroupIdx >= (int)CurrPipeline.size())
		return 0;
		int TempConflInstNo = CurrConflInstNo;
		int TmpSyncGroupIdx = CurrSyncGroupIdx;
		int MinimumCost = 0;
		std::vector<std::pair<SUnit , SUnit >> AddedEdges;
		arsenmUnsubmitted Done Reply Inline Actions I'd assume this can be a SmallVector arsenm: I'd assume this can be a SmallVector

		for (; TmpSyncGroupIdx < (int)CurrPipeline.size(); TmpSyncGroupIdx++) {
		auto SyncPipeline = CurrPipeline[TmpSyncGroupIdx];
		for (; TempConflInstNo < (int)PipelineInstrs[TmpSyncGroupIdx].size();
		TempConflInstNo++) {
		auto CurrSU = PipelineInstrs[TmpSyncGroupIdx][TempConflInstNo];
		auto I = CurrSU.second.rbegin();
		auto E = CurrSU.second.rend();
		int MinCostForSU = -1;
		for (; I != E; I++) {
		jsilvanusUnsubmitted Done Reply Inline Actions Maybe move the vector out of the loops to reduce allocations, and clear() here instead? jsilvanus: Maybe move the vector out of the loops to reduce allocations, and clear() here instead?
		jsilvanusUnsubmitted Done Reply Inline Actions Why not move out of the other two loops as well? Maybe with a comment that this is for performance only, and only needed in the most inner loop. jsilvanus: Why not move out of the other two loops as well? Maybe with a comment that this is for…
		int CandSGID = *I;
		SchedGroup *Match;
		for (auto &SG : SyncPipeline) {
		if (SG.getSGID() == CandSGID)
		Match = &SG;
		}

		if (Match->isFull()) {
		if (MinCostForSU == -1 \|\| MissPenalty < MinCostForSU)
		MinCostForSU = MissPenalty;
		continue;
		}
		AddedEdges.clear();
		int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID,
		jsilvanusUnsubmitted Done Reply Inline Actions In addEdges, we could abort once the added cost exceeds MinCostForSU (if MinCostForSU != -1). Not sure if that would be worth the effort. jsilvanus: In addEdges, we could abort once the added cost exceeds MinCostForSU (if MinCostForSU != -1).
		AddedEdges, MinCostForSU);
		if (MinCostForSU == -1 \|\| TempCost < MinCostForSU)
		MinCostForSU = TempCost;

		removeEdges(AddedEdges);
		if (MinCostForSU == 0)
		break;
		}
		MinimumCost += MinCostForSU;
		}
		TempConflInstNo = 0;
		}
		return MinimumCost;
		}

bool PipelineSolver::solveExact() {		bool PipelineSolver::solveExact() {
if (checkOptimal())		if (checkOptimal())
return true;		return true;

if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())		if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
return false;		return false;

assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());		assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
assert(static_cast<size_t>(CurrConflInstNo) <		assert(static_cast<size_t>(CurrConflInstNo) <
PipelineInstrs[CurrSyncGroupIdx].size());		PipelineInstrs[CurrSyncGroupIdx].size());
SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];		SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum		LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");		<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");

// SchedGroup -> Cost pairs		// SchedGroup -> Cost pairs
SmallVector<std::pair<int, int>, 4> ReadyList;		SmallVector<std::pair<int, int>, 4> ReadyList;
// Prioritize the candidate sched groups in terms of lowest cost first		// Prioritize the candidate sched groups in terms of lowest cost first
populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);		populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
		std::vector<std::pair<SUnit , SUnit >> AddedEdges;

auto I = ReadyList.begin();		auto I = ReadyList.begin();
auto E = ReadyList.end();		auto E = ReadyList.end();
for (; I != E; ++I) {		for (; I != E; ++I) {
// If we are trying SGs in least cost order, and the current SG is cost		// If we are trying SGs in least cost order, and the current SG is cost
// infeasible, then all subsequent SGs will also be cost infeasible, so we		// infeasible, then all subsequent SGs will also be cost infeasible, so we
// can prune.		// can prune.
if (BestCost != -1 && (CurrCost + I->second > BestCost))		if (BestCost != -1 && (CurrCost + I->second > BestCost))
return false;		return false;

int CandSGID = I->first;		int CandSGID = I->first;
int AddedCost = 0;		int AddedCost = 0;
std::vector<std::pair<SUnit , SUnit >> AddedEdges;
auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];		auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
SchedGroup *Match;		SchedGroup *Match;
for (auto &SG : SyncPipeline) {		for (auto &SG : SyncPipeline) {
if (SG.getSGID() == CandSGID)		if (SG.getSGID() == CandSGID)
Match = &SG;		Match = &SG;
}		}

if (Match->isFull())		if (Match->isFull())
continue;		continue;

LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "		LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
<< (int)Match->getMask() << "and ID " << CandSGID		<< (int)Match->getMask() << "and ID " << CandSGID
<< "\n");		<< "\n");
Match->add(*CurrSU.first);		Match->add(*CurrSU.first);
		AddedEdges.clear();
AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);		AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");		LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
CurrCost += AddedCost;		CurrCost += AddedCost;
advancePosition();		advancePosition();
++BranchesExplored;		++BranchesExplored;
bool FinishedExploring = false;		bool FinishedExploring = false;
// If the Cost after adding edges is greater than a known solution,		// If the Cost after adding edges is greater than a known solution,
// backtrack		// backtrack
if (CurrCost < BestCost \|\| BestCost == -1) {		int LBCost =
		(EnableLowerBound && BestCost != -1) ? calculateLowerBound() : 0;
		if (BestCost == -1 \|\| CurrCost + LBCost < BestCost) {
if (solveExact()) {		if (solveExact()) {
FinishedExploring = BestCost != 0;		FinishedExploring = BestCost != StaticLowerBound;
if (!FinishedExploring)		if (!FinishedExploring)
return true;		return true;
}		}
}		}

retreatPosition();		retreatPosition();
CurrCost -= AddedCost;		CurrCost -= AddedCost;
removeEdges(AddedEdges);		removeEdges(AddedEdges);
Show All 9 Lines	bool PipelineSolver::solveExact() {
CurrCost += MissPenalty;		CurrCost += MissPenalty;
advancePosition();		advancePosition();

LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");		LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");

bool FinishedExploring = false;		bool FinishedExploring = false;
if (CurrCost < BestCost \|\| BestCost == -1) {		if (CurrCost < BestCost \|\| BestCost == -1) {
if (solveExact()) {		if (solveExact()) {
bool FinishedExploring = BestCost != 0;		bool FinishedExploring = BestCost != StaticLowerBound;
if (!FinishedExploring)		if (!FinishedExploring)
return true;		return true;
}		}
}		}

retreatPosition();		retreatPosition();
CurrCost -= MissPenalty;		CurrCost -= MissPenalty;
return FinishedExploring;		return FinishedExploring;
Show All 30 Lines	for (; I != E; ++I) {

LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "		LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
<< (int)Match->getMask() << "\n");		<< (int)Match->getMask() << "\n");

if (Match->isFull()) {		if (Match->isFull()) {
LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");		LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
continue;		continue;
}		}
TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);		TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges,
		BestNodeCost);
LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");		LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
if (TempCost < BestNodeCost \|\| BestNodeCost == -1) {		if (TempCost < BestNodeCost \|\| BestNodeCost == -1) {
BestGroup = Match;		BestGroup = Match;
BestNodeCost = TempCost;		BestNodeCost = TempCost;
BestGroupID = CandSGID;		BestGroupID = CandSGID;
}		}
removeEdges(AddedEdges);		removeEdges(AddedEdges);
if (BestNodeCost == 0)		if (BestNodeCost == 0)
Show All 37 Lines	void PipelineSolver::solve() {
MissPenalty = (ProblemSize / 2) + 1;		MissPenalty = (ProblemSize / 2) + 1;

LLVM_DEBUG(DAG->dump());		LLVM_DEBUG(DAG->dump());
if (EnableExactSolver \|\| BelowCutoff) {		if (EnableExactSolver \|\| BelowCutoff) {
LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");		LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
solveGreedy();		solveGreedy();
reset();		reset();
LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");		LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
if (BestCost > 0) {		StaticLowerBound = calculateLowerBound();
		LLVM_DEBUG(dbgs() << "Lower Bound on Pipeline Cost is " << StaticLowerBound
		<< "\n");
		[Inline review comment from arsenm (status: Not Done): "Single quotes" — presumably a request to stream the trailing newline in the debug output above as '\n' rather than "\n".]
		if (BestCost > StaticLowerBound) {
LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");		LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
solveExact();		solveExact();
LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");		LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
}		}
} else { // Use the Greedy Algorithm by default		} else { // Use the Greedy Algorithm by default
LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");		LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
solveGreedy();		solveGreedy();
}		}
▲ Show 20 Lines • Show All 210 Lines • ▼ Show 20 Lines	int SchedGroup::link(SUnit &SU, bool MakePred,
int MissedEdges = 0;		int MissedEdges = 0;
for (auto A : Collection) {		for (auto A : Collection) {
SUnit *B = &SU;		SUnit *B = &SU;
if (A == B \|\| A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)		if (A == B \|\| A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
continue;		continue;
if (MakePred)		if (MakePred)
std::swap(A, B);		std::swap(A, B);

if (DAG->IsReachable(B, A))
continue;
// tryAddEdge returns false if there is a dependency that makes adding
// the A->B edge impossible, otherwise it returns true;
bool Added = tryAddEdge(A, B);		bool Added = tryAddEdge(A, B);
if (Added)		if (Added)
AddedEdges.push_back(std::make_pair(A, B));		AddedEdges.push_back(std::make_pair(A, B));
else		else
++MissedEdges;		++MissedEdges;
}		}

return MissedEdges;		return MissedEdges;
▲ Show 20 Lines • Show All 209 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s \| FileCheck -check-prefix=EXACT %s
				; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver=1 -amdgpu-igrouplp-exact-solver-max-branches=200000 -amdgpu-igrouplp-exact-solver-cost-heur=1 < %s \| FileCheck -check-prefix=LB %s

				define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) #0 {
				; EXACT-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
				; EXACT: ; %bb.0:
				; EXACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; EXACT-NEXT: v_lshlrev_b32_e32 v16, 7, v0
				; EXACT-NEXT: ; kill: killed $sgpr0_sgpr1
				; EXACT-NEXT: s_waitcnt lgkmcnt(0)
				; EXACT-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
				; EXACT-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: s_waitcnt vmcnt(1)
				; EXACT-NEXT: v_mul_lo_u32 v13, v13, v13
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v7, v7, v7
				; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
				; EXACT-NEXT: v_mul_lo_u32 v6, v6, v6
				; EXACT-NEXT: v_mul_lo_u32 v12, v12, v12
				; EXACT-NEXT: v_mul_lo_u32 v15, v15, v15
				; EXACT-NEXT: v_mul_lo_u32 v14, v14, v14
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
				; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
				; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
				; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
				; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
				; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
				; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
				; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
				; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
				; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
				; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
				; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
				; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
				; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
				; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
				; EXACT-NEXT: v_mul_lo_u32 v5, v5, v5
				; EXACT-NEXT: v_mul_lo_u32 v4, v4, v4
				; EXACT-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
				; EXACT-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v7, v7, v7
				; EXACT-NEXT: v_mul_lo_u32 v6, v6, v6
				; EXACT-NEXT: v_mul_lo_u32 v5, v5, v5
				; EXACT-NEXT: v_mul_lo_u32 v4, v4, v4
				; EXACT-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
				; EXACT-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
				; EXACT-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v9, v9, v9
				; EXACT-NEXT: v_mul_lo_u32 v8, v8, v8
				; EXACT-NEXT: v_mul_lo_u32 v11, v11, v11
				; EXACT-NEXT: v_mul_lo_u32 v10, v10, v10
				; EXACT-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
				; EXACT-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v11, v11, v11
				; EXACT-NEXT: v_mul_lo_u32 v10, v10, v10
				; EXACT-NEXT: v_mul_lo_u32 v9, v9, v9
				; EXACT-NEXT: v_mul_lo_u32 v8, v8, v8
				; EXACT-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: s_endpgm
				;
				; LB-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
				; LB: ; %bb.0:
				; LB-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; LB-NEXT: v_lshlrev_b32_e32 v12, 7, v0
				; LB-NEXT: s_waitcnt lgkmcnt(0)
				; LB-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:64
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v11, v11, v11
				; LB-NEXT: v_mul_lo_u32 v10, v10, v10
				; LB-NEXT: v_mul_lo_u32 v9, v9, v9
				; LB-NEXT: v_mul_lo_u32 v8, v8, v8
				; LB-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:64
				; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1]
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v3, v3, v3
				; LB-NEXT: v_mul_lo_u32 v2, v2, v2
				; LB-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:32
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v9, v9, v9
				; LB-NEXT: v_mul_lo_u32 v8, v8, v8
				; LB-NEXT: v_mul_lo_u32 v11, v11, v11
				; LB-NEXT: v_mul_lo_u32 v10, v10, v10
				; LB-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:32
				; LB-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:112
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v7, v7, v7
				; LB-NEXT: v_mul_lo_u32 v6, v6, v6
				; LB-NEXT: v_mul_lo_u32 v1, v1, v1
				; LB-NEXT: v_mul_lo_u32 v0, v0, v0
				; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3]
				; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:96
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v3, v3, v3
				; LB-NEXT: v_mul_lo_u32 v2, v2, v2
				; LB-NEXT: v_mul_lo_u32 v1, v1, v1
				; LB-NEXT: v_mul_lo_u32 v0, v0, v0
				; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:96
				; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:80
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v3, v3, v3
				; LB-NEXT: v_mul_lo_u32 v2, v2, v2
				; LB-NEXT: v_mul_lo_u32 v1, v1, v1
				; LB-NEXT: v_mul_lo_u32 v0, v0, v0
				; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:80
				; LB-NEXT: v_mul_lo_u32 v5, v5, v5
				; LB-NEXT: v_mul_lo_u32 v4, v4, v4
				; LB-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:112
				; LB-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:48
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v5, v5, v5
				; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:16
				; LB-NEXT: v_mul_lo_u32 v4, v4, v4
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v1, v1, v1
				; LB-NEXT: v_mul_lo_u32 v0, v0, v0
				; LB-NEXT: v_mul_lo_u32 v3, v3, v3
				; LB-NEXT: v_mul_lo_u32 v2, v2, v2
				; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:16
				; LB-NEXT: v_mul_lo_u32 v7, v7, v7
				; LB-NEXT: v_mul_lo_u32 v6, v6, v6
				; LB-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:48
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: s_endpgm
				%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
				%gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid
				%load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1
				%mul = mul <32 x i32> %load, %load
				%gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid
				store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				ret void
				}

				; Intrinsic declarations and the shared attribute group for this test.
				; The result of this call (%tid above) is used as the index into %in and %out.
				declare i32 @llvm.amdgcn.workitem.id.x() #0
				; The calls above pass (mask, size, SyncID); these operands show up in the
				; checked output as "sched_group_barrier mask(..) size(..) SyncID(..)".
				declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #0
				; NOTE(review): not referenced in the visible part of this file; presumably
				; used by an earlier test function -- confirm before removing.
				declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #0

				; Attribute group #0, attached to each of the declarations above.
				attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" readnone speculatable}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add Lower Bound to PipelineSolver
Abandoned, Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 460106

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add Lower Bound to PipelineSolver (Abandoned, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 460106

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll

[AMDGPU] Add Lower Bound to PipelineSolver
Abandoned, Public