Diff 507810

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	static cl::opt<bool> UseCostHeur(
"amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,		"amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
cl::desc("Whether to use the cost heuristic to make choices as we "		cl::desc("Whether to use the cost heuristic to make choices as we "
"traverse the search space using the exact solver. Defaulted "		"traverse the search space using the exact solver. Defaulted "
"to on, and if turned off, we will use the node order -- "		"to on, and if turned off, we will use the node order -- "
"attempting to put the later nodes in the later sched groups. "		"attempting to put the later nodes in the later sched groups. "
"Experimentally, results are mixed, so this should be set on a "		"Experimentally, results are mixed, so this should be set on a "
"case-by-case basis."));		"case-by-case basis."));

		static cl::opt<bool> EnableLowerBound(
		"amdgpu-igrouplp-exact-solver-lower-bound", cl::Hidden,
		cl::desc("Whether to use a lower bound when calculating the cost "
		"for a partial fit using the exact solver. The lower bound "
		"calculates the cost of assigning the remaining instructions "
		arsenmUnsubmitted Done Reply Inline Actions Typo calcalutes arsenm: Typo calcalutes
		"under idealized conditions. The LB reduces the overall search "
		"space but adds time complexity per branch explored."),
		cl::init(false));

// Components of the mask that determines which instruction types may be		// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.		// classified into a SchedGroup.
enum class SchedGroupMask {		enum class SchedGroupMask {
NONE = 0u,		NONE = 0u,
ALU = 1u << 0,		ALU = 1u << 0,
VALU = 1u << 1,		VALU = 1u << 1,
SALU = 1u << 2,		SALU = 1u << 2,
MFMA = 1u << 3,		MFMA = 1u << 3,
Show All 32 Lines	private:

// Count of the number of created SchedGroups, used to initialize SGID.		// Count of the number of created SchedGroups, used to initialize SGID.
static unsigned NumSchedGroups;		static unsigned NumSchedGroups;

ScheduleDAGInstrs *DAG;		ScheduleDAGInstrs *DAG;

const SIInstrInfo *TII;		const SIInstrInfo *TII;

// Try to add an edge from SU A to SU B.		// Try to add an edge from SU A to SU B. This returns false if there is a
		// dependency which makes adding the A->B edge impossible, otherwise it
		// returns true. The result is that it will return true even if no edge was
		// added. For example, if there is already an edge between A->B, this will
		// return true, even though DAG->addEdge does not add an edge.
bool tryAddEdge(SUnit A, SUnit B);		bool tryAddEdge(SUnit A, SUnit B);

// Use SGMask to determine whether we can classify MI as a member of this		// Use SGMask to determine whether we can classify MI as a member of this
// SchedGroup object.		// SchedGroup object.
bool canAddMI(const MachineInstr &MI) const;		bool canAddMI(const MachineInstr &MI) const;

public:		public:
// Collection of SUnits that are classified as members of this group.		// Collection of SUnits that are classified as members of this group.
SmallVector<SUnit *, 32> Collection;		SmallVector<SUnit *, 32> Collection;

// Returns true if SU can be added to this SchedGroup.		// Returns true if SU can be added to this SchedGroup.
bool canAddSU(SUnit &SU) const;		bool canAddSU(SUnit &SU) const;

// Add DAG dependencies from all SUnits in this SchedGroup and this SU. If		// Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
// MakePred is true, SU will be a predecessor of the SUnits in this		// MakePred is true, SU will be a predecessor of the SUnits in this
// SchedGroup, otherwise SU will be a successor.		// SchedGroup, otherwise SU will be a successor.
void link(SUnit &SU, bool MakePred = false);		void link(SUnit &SU, bool MakePred = false);

// Add DAG dependencies and track which edges are added, and the count of		// Add DAG dependencies and track which edges are added, and the count of
// missed edges		// missed edges
int link(SUnit &SU, bool MakePred,		int link(SUnit &SU, bool MakePred,
std::vector<std::pair<SUnit , SUnit >> &AddedEdges);		SmallVectorImpl<std::pair<SUnit , SUnit >> &AddedEdges);

// Add DAG dependencies from all SUnits in this SchedGroup and this SU.		// Add DAG dependencies from all SUnits in this SchedGroup and this SU.
// Use the predicate to determine whether SU should be a predecessor (P =		// Use the predicate to determine whether SU should be a predecessor (P =
// true) or a successor (P = false) of this SchedGroup.		// true) or a successor (P = false) of this SchedGroup.
void link(SUnit &SU, function_ref<bool(const SUnit A, const SUnit B)> P);		void link(SUnit &SU, function_ref<bool(const SUnit A, const SUnit B)> P);

// Add DAG dependencies such that SUnits in this group shall be ordered		// Add DAG dependencies such that SUnits in this group shall be ordered
// before SUnits in OtherGroup.		// before SUnits in OtherGroup.
▲ Show 20 Lines • Show All 95 Lines • ▼ Show 20 Lines	class PipelineSolver {

// The cost penalty of not assigning a SU to a SchedGroup		// The cost penalty of not assigning a SU to a SchedGroup
int MissPenalty = 0;		int MissPenalty = 0;

// Costs in terms of the number of edges we are unable to add		// Costs in terms of the number of edges we are unable to add
int BestCost = -1;		int BestCost = -1;
int CurrCost = 0;		int CurrCost = 0;

		// A lower bound on the optimal cost for a complete pipeline
		int StaticLowerBound = 0;

// Index pointing to the conflicting instruction that is currently being		// Index pointing to the conflicting instruction that is currently being
// fitted		// fitted
int CurrConflInstNo = 0;		int CurrConflInstNo = 0;
// Index to the pipeline that is currently being fitted		// Index to the pipeline that is currently being fitted
int CurrSyncGroupIdx = 0;		int CurrSyncGroupIdx = 0;
// The first non trivial pipeline		// The first non trivial pipeline
int BeginSyncGroupIdx = 0;		int BeginSyncGroupIdx = 0;

Show All 11 Lines	class PipelineSolver {
// The polynomial time algorithm which attempts to find a good fit		// The polynomial time algorithm which attempts to find a good fit
bool solveGreedy();		bool solveGreedy();
// Whether or not the current solution is optimal		// Whether or not the current solution is optimal
bool checkOptimal();		bool checkOptimal();
// Populate the ready list, prioritizing fewest missed edges first		// Populate the ready list, prioritizing fewest missed edges first
void populateReadyList(SUToCandSGsPair &CurrSU,		void populateReadyList(SUToCandSGsPair &CurrSU,
SmallVectorImpl<std::pair<int, int>> &ReadyList,		SmallVectorImpl<std::pair<int, int>> &ReadyList,
SmallVectorImpl<SchedGroup> &SyncPipeline);		SmallVectorImpl<SchedGroup> &SyncPipeline);
		// Calculate best cost assignment of an unassigned SU without assigning it.
		// The sum of these costs across SUs represents a Lower Bound on the true best
		// cost for the set of unassigned SUs.
		int calculateLowerBound();
// Add edges corresponding to the SchedGroups as assigned by solver		// Add edges corresponding to the SchedGroups as assigned by solver
void makePipeline();		void makePipeline();
// Add the edges from the SU to the other SchedGroups in pipeline, and		// Add the edges from the SU to the other SchedGroups in pipeline, and
// return the number of edges missed.		// return the number of edges missed.
int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,		int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
std::vector<std::pair<SUnit , SUnit >> &AddedEdges);		SmallVectorImpl<std::pair<SUnit , SUnit >> &AddedEdges,
		int BestCost = -1);
// Remove the edges passed via AddedEdges		// Remove the edges passed via AddedEdges
void removeEdges(const std::vector<std::pair<SUnit , SUnit >> &AddedEdges);		void removeEdges(SmallVectorImpl<std::pair<SUnit , SUnit >> &AddedEdges);
// Convert the passed in maps to arrays for bidirectional iterators		// Convert the passed in maps to arrays for bidirectional iterators
void convertSyncMapsToArrays();		void convertSyncMapsToArrays();

void reset();		void reset();

public:		public:
// Invoke the solver to map instructions to instruction groups. Heuristic &&		// Invoke the solver to map instructions to instruction groups. Heuristic &&
// command-line-option determines to use exact or greedy algorithm.		// command-line-option determines to use exact or greedy algorithm.
▲ Show 20 Lines • Show All 101 Lines • ▼ Show 20 Lines	for (; I != E; ++I) {
GroupA.link(GroupB);		GroupA.link(GroupB);
}		}
}		}
}		}
}		}

int PipelineSolver::addEdges(		int PipelineSolver::addEdges(
SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,		SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
std::vector<std::pair<SUnit , SUnit >> &AddedEdges) {		SmallVectorImpl<std::pair<SUnit , SUnit >> &AddedEdges, int BestCost) {
int AddedCost = 0;		int AddedCost = 0;
bool MakePred = false;		bool MakePred = false;

// The groups in the pipeline are in reverse order. Thus,		// The groups in the pipeline are in reverse order. Thus,
// by traversing them from last to first, we are traversing		// by traversing them from last to first, we are traversing
// them in the order as they were introduced in the code. After we		// them in the order as they were introduced in the code. After we
// pass the group the SU is being assigned to, it should be		// pass the group the SU is being assigned to, it should be
// linked as a predecessor of the subsequent SchedGroups		// linked as a predecessor of the subsequent SchedGroups
auto GroupNo = (int)SyncPipeline.size() - 1;		auto GroupNo = (int)SyncPipeline.size() - 1;
for (; GroupNo >= 0; GroupNo--) {		for (; GroupNo >= 0; GroupNo--) {
		if (BestCost != -1 && AddedCost >= BestCost)
		return AddedCost;
if (SyncPipeline[GroupNo].getSGID() == SGID) {		if (SyncPipeline[GroupNo].getSGID() == SGID) {
MakePred = true;		MakePred = true;
continue;		continue;
}		}
auto Group = &SyncPipeline[GroupNo];		auto Group = &SyncPipeline[GroupNo];
AddedCost += Group->link(*SU, MakePred, AddedEdges);		AddedCost += Group->link(*SU, MakePred, AddedEdges);
assert(AddedCost >= 0);		assert(AddedCost >= 0);
}		}

return AddedCost;		return AddedCost;
}		}

void PipelineSolver::removeEdges(		void PipelineSolver::removeEdges(
const std::vector<std::pair<SUnit , SUnit >> &EdgesToRemove) {		SmallVectorImpl<std::pair<SUnit , SUnit >> &EdgesToRemove) {
// Only remove the edges that we have added when testing		// Only remove the edges that we have added when testing
// the fit.		// the fit.
for (auto &PredSuccPair : EdgesToRemove) {		for (auto &PredSuccPair : EdgesToRemove) {
SUnit *Pred = PredSuccPair.first;		SUnit *Pred = PredSuccPair.first;
SUnit *Succ = PredSuccPair.second;		SUnit *Succ = PredSuccPair.second;

auto Match = llvm::find_if(		auto Match =
Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });		std::find_if(Succ->Preds.begin(), Succ->Preds.end(), [&Pred](SDep &P) {
		return P.getSUnit() == Pred && P.isArtificial();
		});

if (Match != Succ->Preds.end()) {		if (Match != Succ->Preds.end()) {
assert(Match->isArtificial());		assert(Match->isArtificial());
Succ->removePred(*Match);		Succ->removePred(*Match);
}		}
}		}
}		}

void PipelineSolver::advancePosition() {		void PipelineSolver::advancePosition() {
Show All 34 Lines	void PipelineSolver::retreatPosition() {
}		}
}		}

bool PipelineSolver::checkOptimal() {		bool PipelineSolver::checkOptimal() {
if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {		if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
if (BestCost == -1 \|\| CurrCost < BestCost) {		if (BestCost == -1 \|\| CurrCost < BestCost) {
BestPipeline = CurrPipeline;		BestPipeline = CurrPipeline;
BestCost = CurrCost;		BestCost = CurrCost;
LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");		LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << '\n');
}		}
assert(BestCost >= 0);		assert(BestCost >= 0);
}		}

bool DoneExploring = false;		bool DoneExploring = false;
if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)		if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
DoneExploring = true;		DoneExploring = true;

return (DoneExploring \|\| BestCost == 0);		return (DoneExploring \|\| BestCost == StaticLowerBound);
}		}

void PipelineSolver::populateReadyList(		void PipelineSolver::populateReadyList(
SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,		SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
SmallVectorImpl<SchedGroup> &SyncPipeline) {		SmallVectorImpl<SchedGroup> &SyncPipeline) {
assert(CurrSU.second.size() >= 1);		assert(CurrSU.second.size() >= 1);
auto I = CurrSU.second.rbegin();		auto I = CurrSU.second.rbegin();
auto E = CurrSU.second.rend();		auto E = CurrSU.second.rend();
		SmallVector<std::pair<SUnit , SUnit >, 16> AddedEdges;
for (; I != E; ++I) {		for (; I != E; ++I) {
std::vector<std::pair<SUnit , SUnit >> AddedEdges;
int CandSGID = *I;		int CandSGID = *I;
SchedGroup *Match;		SchedGroup *Match;
for (auto &SG : SyncPipeline) {		for (auto &SG : SyncPipeline) {
if (SG.getSGID() == CandSGID)		if (SG.getSGID() == CandSGID)
Match = &SG;		Match = &SG;
}		}

if (UseCostHeur) {		if (UseCostHeur) {
if (Match->isFull()) {		if (Match->isFull()) {
ReadyList.push_back(std::pair(*I, MissPenalty));		ReadyList.push_back(std::pair(*I, MissPenalty));
continue;		continue;
}		}
		AddedEdges.clear();

int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);		int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
ReadyList.push_back(std::pair(*I, TempCost));		ReadyList.push_back(std::pair(*I, TempCost));
removeEdges(AddedEdges);		removeEdges(AddedEdges);
} else		} else
ReadyList.push_back(std::pair(*I, -1));		ReadyList.push_back(std::pair(*I, -1));
}		}

if (UseCostHeur) {		if (UseCostHeur) {
std::sort(ReadyList.begin(), ReadyList.end(),		std::sort(ReadyList.begin(), ReadyList.end(),
[](std::pair<int, int> A, std::pair<int, int> B) {		[](std::pair<int, int> A, std::pair<int, int> B) {
return A.second < B.second;		return A.second < B.second;
});		});
}		}

assert(ReadyList.size() == CurrSU.second.size());		assert(ReadyList.size() == CurrSU.second.size());
}		}

		int PipelineSolver::calculateLowerBound() {
		if (CurrSyncGroupIdx >= (int)CurrPipeline.size())
		return 0;
		int TempConflInstNo = CurrConflInstNo;
		int TmpSyncGroupIdx = CurrSyncGroupIdx;
		int MinimumCost = 0;
		SmallVector<std::pair<SUnit , SUnit >, 16> AddedEdges;
		arsenmUnsubmitted Done Reply Inline Actions I'd assume this can be a SmallVector arsenm: I'd assume this can be a SmallVector

		for (; TmpSyncGroupIdx < (int)CurrPipeline.size(); TmpSyncGroupIdx++) {
		auto SyncPipeline = CurrPipeline[TmpSyncGroupIdx];
		for (; TempConflInstNo < (int)PipelineInstrs[TmpSyncGroupIdx].size();
		TempConflInstNo++) {
		auto CurrSU = PipelineInstrs[TmpSyncGroupIdx][TempConflInstNo];
		auto I = CurrSU.second.rbegin();
		auto E = CurrSU.second.rend();
		int MinCostForSU = -1;
		for (; I != E; I++) {
		jsilvanusUnsubmitted Done Reply Inline Actions Maybe move the vector out of the loops to reduce allocations, and clear() here instead? jsilvanus: Maybe move the vector out of the loops to reduce allocations, and clear() here instead?
		jsilvanusUnsubmitted Done Reply Inline Actions Why not move out of the other two loops as well? Maybe with a comment that this is for performance only, and only needed in the most inner loop. jsilvanus: Why not move out of the other two loops as well? Maybe with a comment that this is for…
		int CandSGID = *I;
		SchedGroup *Match;
		for (auto &SG : SyncPipeline) {
		if (SG.getSGID() == CandSGID)
		Match = &SG;
		}

		if (Match->isFull()) {
		if (MinCostForSU == -1 \|\| MissPenalty < MinCostForSU)
		MinCostForSU = MissPenalty;
		continue;
		}
		AddedEdges.clear();
		int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID,
		jsilvanusUnsubmitted Done Reply Inline Actions In addEdges, we could abort once the added cost exceeds MinCostForSU (if MinCostForSU != -1). Not sure if that would be worth the effort. jsilvanus: In addEdges, we could abort once the added cost exceeds MinCostForSU (if MinCostForSU != -1).
		AddedEdges, MinCostForSU);
		if (MinCostForSU == -1 \|\| TempCost < MinCostForSU)
		MinCostForSU = TempCost;

		removeEdges(AddedEdges);
		if (MinCostForSU == 0)
		break;
		}
		MinimumCost += MinCostForSU;
		}
		TempConflInstNo = 0;
		}
		return MinimumCost;
		}

bool PipelineSolver::solveExact() {		bool PipelineSolver::solveExact() {
if (checkOptimal())		if (checkOptimal())
return true;		return true;

if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())		if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
return false;		return false;

assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());		assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
assert(static_cast<size_t>(CurrConflInstNo) <		assert(static_cast<size_t>(CurrConflInstNo) <
PipelineInstrs[CurrSyncGroupIdx].size());		PipelineInstrs[CurrSyncGroupIdx].size());
SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];		SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum		LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");		<< ") in Pipeline # " << CurrSyncGroupIdx << '\n');

// SchedGroup -> Cost pairs		// SchedGroup -> Cost pairs
SmallVector<std::pair<int, int>, 4> ReadyList;		SmallVector<std::pair<int, int>, 4> ReadyList;
// Prioritize the candidate sched groups in terms of lowest cost first		// Prioritize the candidate sched groups in terms of lowest cost first
populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);		populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
		SmallVector<std::pair<SUnit , SUnit >, 16> AddedEdges;

auto I = ReadyList.begin();		auto I = ReadyList.begin();
auto E = ReadyList.end();		auto E = ReadyList.end();
for (; I != E; ++I) {		for (; I != E; ++I) {
// If we are trying SGs in least cost order, and the current SG is cost		// If we are trying SGs in least cost order, and the current SG is cost
// infeasible, then all subsequent SGs will also be cost infeasible, so we		// infeasible, then all subsequent SGs will also be cost infeasible, so we
// can prune.		// can prune.
if (BestCost != -1 && (CurrCost + I->second > BestCost))		if (BestCost != -1 && (CurrCost + I->second > BestCost))
return false;		return false;

int CandSGID = I->first;		int CandSGID = I->first;
int AddedCost = 0;		int AddedCost = 0;
std::vector<std::pair<SUnit , SUnit >> AddedEdges;
auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];		auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
SchedGroup *Match;		SchedGroup *Match;
for (auto &SG : SyncPipeline) {		for (auto &SG : SyncPipeline) {
if (SG.getSGID() == CandSGID)		if (SG.getSGID() == CandSGID)
Match = &SG;		Match = &SG;
}		}

if (Match->isFull())		if (Match->isFull())
continue;		continue;

LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "		LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
<< (int)Match->getMask() << "and ID " << CandSGID		<< (int)Match->getMask() << "and ID " << CandSGID
<< "\n");		<< '\n');
Match->add(*CurrSU.first);		Match->add(*CurrSU.first);
		AddedEdges.clear();
AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);		AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");		LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << '\n');
CurrCost += AddedCost;		CurrCost += AddedCost;
advancePosition();		advancePosition();
++BranchesExplored;		++BranchesExplored;
bool FinishedExploring = false;		bool FinishedExploring = false;
// If the Cost after adding edges is greater than a known solution,		// If the Cost after adding edges is greater than a known solution,
// backtrack		// backtrack
if (CurrCost < BestCost \|\| BestCost == -1) {		int LBCost =
		(EnableLowerBound && BestCost != -1) ? calculateLowerBound() : 0;
		if (BestCost == -1 \|\| CurrCost + LBCost < BestCost) {
if (solveExact()) {		if (solveExact()) {
FinishedExploring = BestCost != 0;		FinishedExploring = BestCost != StaticLowerBound;
if (!FinishedExploring)		if (!FinishedExploring)
return true;		return true;
}		}
}		}

retreatPosition();		retreatPosition();
CurrCost -= AddedCost;		CurrCost -= AddedCost;
removeEdges(AddedEdges);		removeEdges(AddedEdges);
Show All 9 Lines	bool PipelineSolver::solveExact() {
CurrCost += MissPenalty;		CurrCost += MissPenalty;
advancePosition();		advancePosition();

LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");		LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");

bool FinishedExploring = false;		bool FinishedExploring = false;
if (CurrCost < BestCost \|\| BestCost == -1) {		if (CurrCost < BestCost \|\| BestCost == -1) {
if (solveExact()) {		if (solveExact()) {
bool FinishedExploring = BestCost != 0;		bool FinishedExploring = BestCost != StaticLowerBound;
if (!FinishedExploring)		if (!FinishedExploring)
return true;		return true;
}		}
}		}

retreatPosition();		retreatPosition();
CurrCost -= MissPenalty;		CurrCost -= MissPenalty;
return FinishedExploring;		return FinishedExploring;
}		}

bool PipelineSolver::solveGreedy() {		bool PipelineSolver::solveGreedy() {
BestCost = 0;		BestCost = 0;
std::vector<std::pair<SUnit , SUnit >> AddedEdges;		SmallVector<std::pair<SUnit , SUnit >, 16> AddedEdges;

while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {		while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];		SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
int BestNodeCost = -1;		int BestNodeCost = -1;
int TempCost;		int TempCost;
SchedGroup *BestGroup = nullptr;		SchedGroup *BestGroup = nullptr;
int BestGroupID = -1;		int BestGroupID = -1;
auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];		auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum		LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");		<< ") in Pipeline # " << CurrSyncGroupIdx << '\n');

// Since we have added the potential SchedGroups from bottom up, but		// Since we have added the potential SchedGroups from bottom up, but
// traversed the DAG from top down, parse over the groups from last to		// traversed the DAG from top down, parse over the groups from last to
// first. If we fail to do this for the greedy algorithm, the solution will		// first. If we fail to do this for the greedy algorithm, the solution will
// likely not be good in more complex cases.		// likely not be good in more complex cases.
auto I = CurrSU.second.rbegin();		auto I = CurrSU.second.rbegin();
auto E = CurrSU.second.rend();		auto E = CurrSU.second.rend();
for (; I != E; ++I) {		for (; I != E; ++I) {
std::vector<std::pair<SUnit , SUnit >> AddedEdges;		SmallVector<std::pair<SUnit , SUnit >, 16> AddedEdges;
int CandSGID = *I;		int CandSGID = *I;
SchedGroup *Match;		SchedGroup *Match;
for (auto &SG : SyncPipeline) {		for (auto &SG : SyncPipeline) {
if (SG.getSGID() == CandSGID)		if (SG.getSGID() == CandSGID)
Match = &SG;		Match = &SG;
}		}

LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "		LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
<< (int)Match->getMask() << "\n");		<< (int)Match->getMask() << '\n');

if (Match->isFull()) {		if (Match->isFull()) {
LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");		LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
continue;		continue;
}		}
TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);		TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges,
LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");		BestNodeCost);
		LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << '\n');
if (TempCost < BestNodeCost \|\| BestNodeCost == -1) {		if (TempCost < BestNodeCost \|\| BestNodeCost == -1) {
BestGroup = Match;		BestGroup = Match;
BestNodeCost = TempCost;		BestNodeCost = TempCost;
BestGroupID = CandSGID;		BestGroupID = CandSGID;
}		}
removeEdges(AddedEdges);		removeEdges(AddedEdges);
if (BestNodeCost == 0)		if (BestNodeCost == 0)
break;		break;
}		}

if (BestGroupID != -1) {		if (BestGroupID != -1) {
BestGroup->add(*CurrSU.first);		BestGroup->add(*CurrSU.first);
addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);		addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"		LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
<< (int)BestGroup->getMask() << "\n");		<< (int)BestGroup->getMask() << '\n');
BestCost += TempCost;		BestCost += TempCost;
} else		} else
BestCost += MissPenalty;		BestCost += MissPenalty;

CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;		CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
advancePosition();		advancePosition();
}		}
BestPipeline = CurrPipeline;		BestPipeline = CurrPipeline;
Show All 20 Lines	void PipelineSolver::solve() {
bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;		bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
MissPenalty = (ProblemSize / 2) + 1;		MissPenalty = (ProblemSize / 2) + 1;

LLVM_DEBUG(DAG->dump());		LLVM_DEBUG(DAG->dump());
if (EnableExactSolver \|\| BelowCutoff) {		if (EnableExactSolver \|\| BelowCutoff) {
LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");		LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
solveGreedy();		solveGreedy();
reset();		reset();
LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");		LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << '\n');
if (BestCost > 0) {		StaticLowerBound = calculateLowerBound();
		LLVM_DEBUG(dbgs() << "Lower Bound on Pipeline Cost is " << StaticLowerBound
		arsenmUnsubmitted Not Done Reply Inline Actions Single quotes arsenm: Single quotes
		<< '\n');
		if (BestCost > StaticLowerBound) {
LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");		LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
solveExact();		solveExact();
LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");		LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << '\n');
}		}
} else { // Use the Greedy Algorithm by default		} else { // Use the Greedy Algorithm by default
LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");		LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
solveGreedy();		solveGreedy();
}		}

makePipeline();		makePipeline();
}		}
▲ Show 20 Lines • Show All 167 Lines • ▼ Show 20 Lines	bool SchedGroup::canAddMI(const MachineInstr &MI) const {
LLVM_DEBUG(		LLVM_DEBUG(
dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)		dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
<< (Result ? " could classify " : " unable to classify ") << MI);		<< (Result ? " could classify " : " unable to classify ") << MI);

return Result;		return Result;
}		}

int SchedGroup::link(SUnit &SU, bool MakePred,		int SchedGroup::link(SUnit &SU, bool MakePred,
std::vector<std::pair<SUnit , SUnit >> &AddedEdges) {		SmallVectorImpl<std::pair<SUnit , SUnit >> &AddedEdges) {
int MissedEdges = 0;		int MissedEdges = 0;
for (auto *A : Collection) {		for (auto *A : Collection) {
SUnit *B = &SU;		SUnit *B = &SU;
if (A == B \|\| A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)		if (A == B \|\| A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
continue;		continue;
if (MakePred)		if (MakePred)
std::swap(A, B);		std::swap(A, B);

if (DAG->IsReachable(B, A))
continue;
// tryAddEdge returns false if there is a dependency that makes adding
// the A->B edge impossible, otherwise it returns true;
bool Added = tryAddEdge(A, B);		bool Added = tryAddEdge(A, B);
if (Added)		if (Added)
AddedEdges.push_back(std::pair(A, B));		AddedEdges.push_back(std::pair(A, B));
else		else
++MissedEdges;		++MissedEdges;
}		}

return MissedEdges;		return MissedEdges;
▲ Show 20 Lines • Show All 209 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s \| FileCheck -check-prefix=EXACT %s
				; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver=1 -amdgpu-igrouplp-exact-solver-max-branches=200000 -amdgpu-igrouplp-exact-solver-cost-heur=1 < %s \| FileCheck -check-prefix=LB %s

				; Kernel under test: each work-item loads one <32 x i32> from %in (indexed by
				; its workitem id x), multiplies it by itself element-wise, and stores the
				; result to %out at the same index. The trailing
				; llvm.amdgcn.sched.group.barrier calls then request a repeating
				; "1 VMEM read, 2 VALU, 1 VMEM write" pipeline, all with SyncID 0.
				;
				; The two groups of expected-assembly lines that follow are autogenerated, one
				; group per check prefix used by the run lines at the top of this file.
				; NOTE(review): the run line for the LB prefix does not pass
				; -amdgpu-igrouplp-exact-solver-lower-bound, and the run line for the EXACT
				; prefix does not pass -amdgpu-igrouplp-exact-solver=1 -- confirm that both
				; run lines exercise the solver configuration their prefix names suggest.
				define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) #0 {
				; EXACT-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
				; EXACT: ; %bb.0:
				; EXACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; EXACT-NEXT: v_lshlrev_b32_e32 v16, 7, v0
				; EXACT-NEXT: ; kill: killed $sgpr0_sgpr1
				; EXACT-NEXT: s_waitcnt lgkmcnt(0)
				; EXACT-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
				; EXACT-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: s_waitcnt vmcnt(1)
				; EXACT-NEXT: v_mul_lo_u32 v13, v13, v13
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v7, v7, v7
				; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
				; EXACT-NEXT: v_mul_lo_u32 v6, v6, v6
				; EXACT-NEXT: v_mul_lo_u32 v12, v12, v12
				; EXACT-NEXT: v_mul_lo_u32 v15, v15, v15
				; EXACT-NEXT: v_mul_lo_u32 v14, v14, v14
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
				; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
				; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
				; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
				; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
				; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
				; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
				; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
				; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
				; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
				; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
				; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
				; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
				; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
				; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
				; EXACT-NEXT: v_mul_lo_u32 v5, v5, v5
				; EXACT-NEXT: v_mul_lo_u32 v4, v4, v4
				; EXACT-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
				; EXACT-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v7, v7, v7
				; EXACT-NEXT: v_mul_lo_u32 v6, v6, v6
				; EXACT-NEXT: v_mul_lo_u32 v5, v5, v5
				; EXACT-NEXT: v_mul_lo_u32 v4, v4, v4
				; EXACT-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
				; EXACT-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
				; EXACT-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v9, v9, v9
				; EXACT-NEXT: v_mul_lo_u32 v8, v8, v8
				; EXACT-NEXT: v_mul_lo_u32 v11, v11, v11
				; EXACT-NEXT: v_mul_lo_u32 v10, v10, v10
				; EXACT-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
				; EXACT-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: s_waitcnt vmcnt(0)
				; EXACT-NEXT: v_mul_lo_u32 v11, v11, v11
				; EXACT-NEXT: v_mul_lo_u32 v10, v10, v10
				; EXACT-NEXT: v_mul_lo_u32 v9, v9, v9
				; EXACT-NEXT: v_mul_lo_u32 v8, v8, v8
				; EXACT-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; EXACT-NEXT: s_endpgm
				;
				; LB-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
				; LB: ; %bb.0:
				; LB-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; LB-NEXT: v_lshlrev_b32_e32 v12, 7, v0
				; LB-NEXT: s_waitcnt lgkmcnt(0)
				; LB-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:64
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v11, v11, v11
				; LB-NEXT: v_mul_lo_u32 v10, v10, v10
				; LB-NEXT: v_mul_lo_u32 v9, v9, v9
				; LB-NEXT: v_mul_lo_u32 v8, v8, v8
				; LB-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:64
				; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1]
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v3, v3, v3
				; LB-NEXT: v_mul_lo_u32 v2, v2, v2
				; LB-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:32
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v9, v9, v9
				; LB-NEXT: v_mul_lo_u32 v8, v8, v8
				; LB-NEXT: v_mul_lo_u32 v11, v11, v11
				; LB-NEXT: v_mul_lo_u32 v10, v10, v10
				; LB-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:32
				; LB-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:112
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v7, v7, v7
				; LB-NEXT: v_mul_lo_u32 v6, v6, v6
				; LB-NEXT: v_mul_lo_u32 v1, v1, v1
				; LB-NEXT: v_mul_lo_u32 v0, v0, v0
				; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3]
				; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:96
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v3, v3, v3
				; LB-NEXT: v_mul_lo_u32 v2, v2, v2
				; LB-NEXT: v_mul_lo_u32 v1, v1, v1
				; LB-NEXT: v_mul_lo_u32 v0, v0, v0
				; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:96
				; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:80
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v3, v3, v3
				; LB-NEXT: v_mul_lo_u32 v2, v2, v2
				; LB-NEXT: v_mul_lo_u32 v1, v1, v1
				; LB-NEXT: v_mul_lo_u32 v0, v0, v0
				; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:80
				; LB-NEXT: v_mul_lo_u32 v5, v5, v5
				; LB-NEXT: v_mul_lo_u32 v4, v4, v4
				; LB-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:112
				; LB-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:48
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v5, v5, v5
				; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:16
				; LB-NEXT: v_mul_lo_u32 v4, v4, v4
				; LB-NEXT: s_waitcnt vmcnt(0)
				; LB-NEXT: v_mul_lo_u32 v1, v1, v1
				; LB-NEXT: v_mul_lo_u32 v0, v0, v0
				; LB-NEXT: v_mul_lo_u32 v3, v3, v3
				; LB-NEXT: v_mul_lo_u32 v2, v2, v2
				; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:16
				; LB-NEXT: v_mul_lo_u32 v7, v7, v7
				; LB-NEXT: v_mul_lo_u32 v6, v6, v6
				; LB-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:48
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
				; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
				; LB-NEXT: s_endpgm
				; Workload: load a <32 x i32> at index %tid of %in, square it, and store it
				; at index %tid of %out.
				%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
				%gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid
				%load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1
				%mul = mul <32 x i32> %load, %load
				%gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid
				store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2
				; Requested pipeline: the same three-barrier group (mask 32 / size 1, mask 2 /
				; size 2, mask 64 / size 1, every one with SyncID 0) is repeated eight times.
				; The three i32 operands show up in the expected assembly as
				; "mask(..) size(..) SyncID(..)" in that order.
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				; 1 VMEM read
				call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
				; 2 VALU
				call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
				; 1 VMEM write
				call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
				ret void
				}

				; Intrinsic declarations for this test file.
				declare i32 @llvm.amdgcn.workitem.id.x() #0
				declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #0
				; NOTE(review): no call to this MFMA intrinsic is visible in this test --
				; confirm whether the declaration is still needed.
				declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #0

				; Attribute group #0 is referenced by the kernel definition, by the
				; workitem.id.x call site, and by every declaration above.
				attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" readnone speculatable}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add Lower Bound to PipelineSolver
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 507810

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add Lower Bound to PipelineSolver
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 507810

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll

[AMDGPU] Add Lower Bound to PipelineSolver
AbandonedPublic