This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Speed up live-in virtual register set computaion in GCNScheduleDAGMILive
ClosedPublic

Authored by vpykhtin on May 24 2019, 9:06 AM.

Download Raw Diff

Details

Reviewers

Commits

rG7e854e1cdd23: [AMDGPU] Speed up live-in virtual register set computaion in…
rL363661: [AMDGPU] Speed up live-in virtual register set computaion in…

Summary

On functions with a large number of basic blocks, computing the live-in virtual register set for every BB in the scheduler takes a very long time. Currently it has complexity:

C1 = O(NumVirtRegs * lg(averageLiveRangeSegmentsPerReg) * BBNumber).

This patch changes the complexity to:

C2 = O(NumVirtRegs * averageLiveRangeSegmentsPerReg * lg(BBNumber)).

For this purpose live-ins are calculated all at once for all BBs. The BBs' starting SlotIndexes are collected and sorted, and then searched using binary search from within the LiveRange segments, which makes the cost logarithmic in the number of BBs. This gives an almost 3 times speedup on the luxmark hotel scene.

Diff Detail

Repository: rL LLVM

Event Timeline

vpykhtin created this revision.May 24 2019, 9:06 AM

Herald added a project: Restricted Project. · View Herald TranscriptMay 24 2019, 9:06 AM

Herald added subscribers: llvm-commits, mgrang, t-tye and 9 others. · View Herald Transcript

arsenm added inline comments.May 24 2019, 9:17 AM

include/llvm/CodeGen/LiveInterval.h
608–627 ↗	(On Diff #201265)	This should be split into a separate patch

LGTM, but you need to split LiveInterval.h part in a parent review.

split LiveInterval.h change into different patch

LGTM

This revision is now accepted and ready to land.May 24 2019, 10:50 AM

vpykhtin added a parent revision: D62411: LiveIntervals: add LiveRange::findIndexesLiveAt function - return a list of SlotIndexes the LiveRange live at..May 24 2019, 10:52 AM

replaced std::vector with SmallVector, moved out of the loop.

Closed by commit rL363661: [AMDGPU] Speed up live-in virtual register set computaion in… (authored by vpykhtin). · Explain WhyJun 18 2019, 4:40 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AMDGPU/

47 lines

7 lines

3 lines

28 lines

Diff 205302

llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h

Show First 20 Lines • Show All 184 Lines • ▼ Show 20 Lines	LaneBitmask getLiveLaneMask(unsigned Reg,
SlotIndex SI,		SlotIndex SI,
const LiveIntervals &LIS,		const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);		const MachineRegisterInfo &MRI);

GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,		GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,		const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);		const MachineRegisterInfo &MRI);

		/// Creates a map MachineInstr -> LiveRegSet for a batch of instructions.
		/// R     - range of MachineInstr pointers to query
		/// After - if true the live set is taken at the dead slot of every
		///         instruction (upon exit), otherwise at its base index (upon entry)
		/// LIS   - live intervals the queries are answered from
		/// Note: there is no entry in the map for instructions with empty live reg
		/// set; an empty R yields an empty map.
		/// Complexity = O(NumVirtRegs * averageLiveRangeSegmentsPerReg * lg(R))
		template <typename Range>
		DenseMap<MachineInstr*, GCNRPTracker::LiveRegSet>
		getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
		  // Collect the slot index to query for every instruction in R.
		  std::vector<SlotIndex> Indexes;
		  Indexes.reserve(std::distance(R.begin(), R.end()));
		  auto &SII = *LIS.getSlotIndexes();
		  for (MachineInstr *I : R) {
		    auto SI = SII.getInstructionIndex(*I);
		    Indexes.push_back(After ? SI.getDeadSlot() : SI.getBaseIndex());
		  }

		  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap;
		  // Nothing to query. Bail out before R.begin() is dereferenced below,
		  // which would be invalid for an empty range.
		  if (Indexes.empty())
		    return LiveRegMap;

		  // findIndexesLiveAt is handed the indexes in sorted order.
		  std::sort(Indexes.begin(), Indexes.end());

		  auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo();
		  // Scratch buffers reused across registers to avoid reallocation.
		  SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs;
		  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
		    auto Reg = TargetRegisterInfo::index2VirtReg(I);
		    if (!LIS.hasInterval(Reg))
		      continue;
		    auto &LI = LIS.getInterval(Reg);
		    LiveIdxs.clear();
		    // Presumably returns false when the register is live at none of the
		    // queried indexes -- confirm against LiveRange::findIndexesLiveAt.
		    if (!LI.findIndexesLiveAt(Indexes, std::back_inserter(LiveIdxs)))
		      continue;
		    if (!LI.hasSubRanges()) {
		      // No lane-level information: record the full lane mask.
		      for (auto SI : LiveIdxs)
		        LiveRegMap[SII.getInstructionFromIndex(SI)][Reg] =
		            MRI.getMaxLaneMaskForVReg(Reg);
		    } else {
		      for (const auto &S : LI.subranges()) {
		        // constrain search for subranges by indexes live at main range
		        SRLiveIdxs.clear();
		        S.findIndexesLiveAt(LiveIdxs, std::back_inserter(SRLiveIdxs));
		        // Accumulate the lanes of every subrange live at the index.
		        for (auto SI : SRLiveIdxs)
		          LiveRegMap[SII.getInstructionFromIndex(SI)][Reg] |= S.LaneMask;
		      }
		    }
		  }
		  return LiveRegMap;
		}

inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,		inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
const LiveIntervals &LIS) {		const LiveIntervals &LIS) {
return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,		return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,
MI.getParent()->getParent()->getRegInfo());		MI.getParent()->getParent()->getRegInfo());
}		}

inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,		inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,
const LiveIntervals &LIS) {		const LiveIntervals &LIS) {
return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS,		return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS,
MI.getParent()->getParent()->getRegInfo());		MI.getParent()->getParent()->getRegInfo());
}		}

template <typename Range>		template <typename Range>
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,		GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
Range &&LiveRegs) {		Range &&LiveRegs) {
GCNRegPressure Res;		GCNRegPressure Res;
for (const auto &RM : LiveRegs)		for (const auto &RM : LiveRegs)
Res.inc(RM.first, LaneBitmask::getNone(), RM.second, MRI);		Res.inc(RM.first, LaneBitmask::getNone(), RM.second, MRI);
return Res;		return Res;
}		}

		bool isEqual(const GCNRPTracker::LiveRegSet &S1,
		const GCNRPTracker::LiveRegSet &S2);

void printLivesAt(SlotIndex SI,		void printLivesAt(SlotIndex SI,
const LiveIntervals &LIS,		const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);		const MachineRegisterInfo &MRI);

} // end namespace llvm		} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H		#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H

llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp

Show First 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	if (LI.hasSubRanges()) {
}		}
} else if (LI.liveAt(SI)) {		} else if (LI.liveAt(SI)) {
dbgs() << " " << LI << '\n';		dbgs() << " " << LI << '\n';
++Num;		++Num;
}		}
}		}
if (!Num) dbgs() << " <none>\n";		if (!Num) dbgs() << " <none>\n";
}		}
		#endif

static bool isEqual(const GCNRPTracker::LiveRegSet &S1,		bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
const GCNRPTracker::LiveRegSet &S2) {		const GCNRPTracker::LiveRegSet &S2) {
if (S1.size() != S2.size())		if (S1.size() != S2.size())
return false;		return false;

for (const auto &P : S1) {		for (const auto &P : S1) {
auto I = S2.find(P.first);		auto I = S2.find(P.first);
if (I == S2.end() || I->second != P.second)		if (I == S2.end() || I->second != P.second)
return false;		return false;
}		}
return true;		return true;
}		}
#endif

///////////////////////////////////////////////////////////////////////////////		///////////////////////////////////////////////////////////////////////////////
// GCNRegPressure		// GCNRegPressure

unsigned GCNRegPressure::getRegKind(unsigned Reg,		unsigned GCNRegPressure::getRegKind(unsigned Reg,
const MachineRegisterInfo &MRI) {		const MachineRegisterInfo &MRI) {
assert(TargetRegisterInfo::isVirtualRegister(Reg));		assert(TargetRegisterInfo::isVirtualRegister(Reg));
const auto RC = MRI.getRegClass(Reg);		const auto RC = MRI.getRegClass(Reg);
▲ Show 20 Lines • Show All 419 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.h

Show First 20 Lines • Show All 84 Lines • ▼ Show 20 Lines	class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;		SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;

// Region pressure cache.		// Region pressure cache.
SmallVector<GCNRegPressure, 32> Pressure;		SmallVector<GCNRegPressure, 32> Pressure;

// Temporary basic block live-in cache.		// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;		DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;

		DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
		DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;

// Return current region pressure.		// Return current region pressure.
GCNRegPressure getRealRegPressure() const;		GCNRegPressure getRealRegPressure() const;

// Compute and cache live-ins and pressure for all regions in block.		// Compute and cache live-ins and pressure for all regions in block.
void computeBlockPressure(const MachineBasicBlock *MBB);		void computeBlockPressure(const MachineBasicBlock *MBB);


public:		public:
Show All 11 Lines

llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Show First 20 Lines • Show All 439 Lines • ▼ Show 20 Lines	void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {

auto I = MBB->begin();		auto I = MBB->begin();
auto LiveInIt = MBBLiveIns.find(MBB);		auto LiveInIt = MBBLiveIns.find(MBB);
if (LiveInIt != MBBLiveIns.end()) {		if (LiveInIt != MBBLiveIns.end()) {
auto LiveIn = std::move(LiveInIt->second);		auto LiveIn = std::move(LiveInIt->second);
RPTracker.reset(*MBB->begin(), &LiveIn);		RPTracker.reset(*MBB->begin(), &LiveIn);
MBBLiveIns.erase(LiveInIt);		MBBLiveIns.erase(LiveInIt);
} else {		} else {
I = Regions[CurRegion].first;		auto &Rgn = Regions[CurRegion];
RPTracker.reset(*I);		I = Rgn.first;
		auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
		auto LRS = BBLiveInMap.lookup(NonDbgMI);
		assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
		RPTracker.reset(*I, &LRS);
}		}

for ( ; ; ) {		for ( ; ; ) {
I = RPTracker.getNext();		I = RPTracker.getNext();

if (Regions[CurRegion].first == I) {		if (Regions[CurRegion].first == I) {
LiveIns[CurRegion] = RPTracker.getLiveRegs();		LiveIns[CurRegion] = RPTracker.getLiveRegs();
RPTracker.clearMaxPressure();		RPTracker.clearMaxPressure();
Show All 14 Lines	if (I != MBB->end()) {
RPTracker.advance(MBB->end());		RPTracker.advance(MBB->end());
}		}
RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs());		RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs());
RPTracker.advanceBeforeNext();		RPTracker.advanceBeforeNext();
MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();		MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
}		}
}		}

		/// Computes, in a single getLiveRegMap query, the live register sets upon
		/// entry of the first non-debug instruction of the collected regions.
		/// Regions are walked from last to first. Must not be called with an empty
		/// Regions list.
		DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
		GCNScheduleDAGMILive::getBBLiveInMap() const {
		  assert(!Regions.empty());
		  std::vector<MachineInstr *> BBStarters;
		  BBStarters.reserve(Regions.size());
		  auto I = Regions.rbegin(), E = Regions.rend();
		  // NOTE(review): BB is captured once here and never refreshed inside the
		  // loop, so only the regions of the first visited block are collapsed to
		  // one entry; later blocks contribute every region. Looks like BB should
		  // be updated per outer iteration -- confirm against the consumer in
		  // computeBlockPressure before changing.
		  auto *BB = I->first->getParent();
		  do {
		    // The result is dereferenced and its address taken to get the
		    // MachineInstr pointer used as the map key.
		    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
		    BBStarters.push_back(MI);
		    // Skip the following regions that still belong to BB.
		    do {
		      ++I;
		    } while (I != E && I->first->getParent() == BB);
		  } while (I != E);
		  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
		}

void GCNScheduleDAGMILive::finalizeSchedule() {		void GCNScheduleDAGMILive::finalizeSchedule() {
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;		GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");		LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

LiveIns.resize(Regions.size());		LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());		Pressure.resize(Regions.size());

		if (!Regions.empty())
		BBLiveInMap = getBBLiveInMap();

do {		do {
Stage++;		Stage++;
RegionIdx = 0;		RegionIdx = 0;
MachineBasicBlock *MBB = nullptr;		MachineBasicBlock *MBB = nullptr;

if (Stage > 1) {		if (Stage > 1) {
// Retry function scheduling if we found resulting occupancy and it is		// Retry function scheduling if we found resulting occupancy and it is
// lower than used for first pass scheduling. This will give more freedom		// lower than used for first pass scheduling. This will give more freedom
▲ Show 20 Lines • Show All 52 Lines • Show Last 20 Lines