This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Attempt to reschedule withou clustering
ClosedPublic

Authored by rampitec on Jan 24 2020, 3:30 PM.

Download Raw Diff

Details

Reviewers

foad
kerbowa
vpykhtin

Commits

rG53eb0f8c0713: [AMDGPU] Attempt to reschedule withou clustering

Summary

We want more load/store clustering, but we also want to
maintain low register pressure; these are opposing goals.
Allow the scheduler to reschedule regions without mutations
applied if we hit a register limit.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

rampitec created this revision.Jan 24 2020, 3:30 PM

Herald added a project: Restricted Project. · View Herald TranscriptJan 24 2020, 3:30 PM

Herald added subscribers: hiraditya, t-tye, tpr and 7 others. · View Herald Transcript

This tries scheduling with all mutations disabled, including macro fusion, right?

Instead of adding another scheduling pass could you try always disabling mutations for the first pass, and have them enabled for the second pass with a fallback if we drop occupancy?

In D73386#1839876, @kerbowa wrote:

This tries scheduling with all mutations disabled, including macro fusion, right?

Yes. Macro fusion should have the same impact on register pressure as clustering; in a sense, it is a form of clustering.

Instead of adding another scheduling pass could you try always disabling mutations for the first pass, and have them enabled for the second pass with a fallback if we drop occupancy?

I thought about it but chose not to. It would effectively double scheduling time. Note that I am only rescheduling when we run out of registers. If I did it the other way, I would always have to attempt rescheduling. Currently the impact on compile time should be minimal.

LGTM

This revision is now accepted and ready to land.Jan 25 2020, 4:24 PM

LGTM.

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
97	Maybe a BitVector? Don't have a strong opinion though.

Switched to BitVector.

LGTM, Thanks.

Closed by commit rG53eb0f8c0713: [AMDGPU] Attempt to reschedule withou clustering (authored by rampitec). · Explain WhyJan 27 2020, 10:29 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

GCNSchedStrategy.h

12 lines

GCNSchedStrategy.cpp

69 lines

test/

CodeGen/

AMDGPU/

schedule-regpressure-limit-clustering.ll

36 lines

Diff 240634

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Show First 20 Lines • Show All 58 Lines • ▼ Show 20 Lines	public:

void initialize(ScheduleDAGMI *DAG) override;		void initialize(ScheduleDAGMI *DAG) override;

void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }		void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
};		};

class GCNScheduleDAGMILive final : public ScheduleDAGMILive {		class GCNScheduleDAGMILive final : public ScheduleDAGMILive {

		enum : unsigned {
		Collect,
		InitialSchedule,
		UnclusteredReschedule,
		ClusteredLowOccupancyReschedule,
		LastStage = ClusteredLowOccupancyReschedule
		};

const GCNSubtarget &ST;		const GCNSubtarget &ST;

SIMachineFunctionInfo &MFI;		SIMachineFunctionInfo &MFI;

// Occupancy target at the beginning of function scheduling cycle.		// Occupancy target at the beginning of function scheduling cycle.
unsigned StartingOccupancy;		unsigned StartingOccupancy;

// Minimal real occupancy recorder for the function.		// Minimal real occupancy recorder for the function.
unsigned MinOccupancy;		unsigned MinOccupancy;

// Scheduling stage number.		// Scheduling stage number.
unsigned Stage;		unsigned Stage;

// Current region index.		// Current region index.
size_t RegionIdx;		size_t RegionIdx;

// Vector of regions recorder for later rescheduling		// Vector of regions recorder for later rescheduling
SmallVector<std::pair<MachineBasicBlock::iterator,		SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;		MachineBasicBlock::iterator>, 32> Regions;

		// Records if a region is not yet scheduled, or schedule has been reverted,
		// or we generally desire to reschedule it.
		BitVector RescheduleRegions;
		vpykhtinUnsubmitted Done Reply Inline Actions Maybe a BitVector? Don't have a strong opinion though. vpykhtin: Maybe a BitVector? Don't have a strong opinion though.

// Region live-in cache.		// Region live-in cache.
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;		SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;

// Region pressure cache.		// Region pressure cache.
SmallVector<GCNRegPressure, 32> Pressure;		SmallVector<GCNRegPressure, 32> Pressure;

// Temporary basic block live-in cache.		// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;		DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
Show All 23 Lines

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Show First 20 Lines • Show All 310 Lines • ▼ Show 20 Lines
}		}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,		GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S) :		std::unique_ptr<MachineSchedStrategy> S) :
ScheduleDAGMILive(C, std::move(S)),		ScheduleDAGMILive(C, std::move(S)),
ST(MF.getSubtarget<GCNSubtarget>()),		ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),		MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(MFI.getOccupancy()),		StartingOccupancy(MFI.getOccupancy()),
MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {		MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {

LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");		LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}		}

void GCNScheduleDAGMILive::schedule() {		void GCNScheduleDAGMILive::schedule() {
if (Stage == 0) {		if (Stage == Collect) {
// Just record regions at the first pass.		// Just record regions at the first pass.
Regions.push_back(std::make_pair(RegionBegin, RegionEnd));		Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
return;		return;
}		}

std::vector<MachineInstr*> Unsched;		std::vector<MachineInstr*> Unsched;
Unsched.reserve(NumRegionInstrs);		Unsched.reserve(NumRegionInstrs);
for (auto &I : *this) {		for (auto &I : *this) {
Show All 9 Lines	LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
dbgs() << "Region live-in pressure: ";		dbgs() << "Region live-in pressure: ";
llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());		llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
dbgs() << "Region register pressure: ";		dbgs() << "Region register pressure: ";
PressureBefore.print(dbgs()));		PressureBefore.print(dbgs()));
}		}

ScheduleDAGMILive::schedule();		ScheduleDAGMILive::schedule();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);		Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
		RescheduleRegions[RegionIdx] = false;

if (!LIS)		if (!LIS)
return;		return;

// Check the results of scheduling.		// Check the results of scheduling.
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;		GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
auto PressureAfter = getRealRegPressure();		auto PressureAfter = getRealRegPressure();

Show All 25 Lines	void GCNScheduleDAGMILive::schedule() {
}		}
if (NewOccupancy < MinOccupancy) {		if (NewOccupancy < MinOccupancy) {
MinOccupancy = NewOccupancy;		MinOccupancy = NewOccupancy;
MFI.limitOccupancy(MinOccupancy);		MFI.limitOccupancy(MinOccupancy);
LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "		LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
<< MinOccupancy << ".\n");		<< MinOccupancy << ".\n");
}		}

		unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
		unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
		if (PressureAfter.getVGPRNum() > MaxVGPRs \|\|
		PressureAfter.getSGPRNum() > MaxSGPRs)
		RescheduleRegions[RegionIdx] = true;

if (WavesAfter >= MinOccupancy) {		if (WavesAfter >= MinOccupancy) {
unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);		if (Stage == UnclusteredReschedule &&
unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST);		!PressureAfter.less(ST, PressureBefore)) {
if (WavesAfter > MFI.getMinWavesPerEU() \|\|		LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
		} else if (WavesAfter > MFI.getMinWavesPerEU() \|\|
PressureAfter.less(ST, PressureBefore) \|\|		PressureAfter.less(ST, PressureBefore) \|\|
(TotalVGPRs >= PressureAfter.getVGPRNum() &&		!RescheduleRegions[RegionIdx]) {
TotalSGPRs >= PressureAfter.getSGPRNum())) {
Pressure[RegionIdx] = PressureAfter;		Pressure[RegionIdx] = PressureAfter;
return;		return;
}		} else {
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");		LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
}		}
		}

LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");		LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
		RescheduleRegions[RegionIdx] = true;
RegionEnd = RegionBegin;		RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {		for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr())		if (MI->isDebugInstr())
continue;		continue;

if (MI->getIterator() != RegionEnd) {		if (MI->getIterator() != RegionEnd) {
BB->remove(MI);		BB->remove(MI);
BB->insert(RegionEnd, MI);		BB->insert(RegionEnd, MI);
▲ Show 20 Lines • Show All 113 Lines • ▼ Show 20 Lines
}		}

void GCNScheduleDAGMILive::finalizeSchedule() {		void GCNScheduleDAGMILive::finalizeSchedule() {
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;		GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");		LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

LiveIns.resize(Regions.size());		LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());		Pressure.resize(Regions.size());
		RescheduleRegions.resize(Regions.size());
		RescheduleRegions.set();

if (!Regions.empty())		if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();		BBLiveInMap = getBBLiveInMap();

		std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;

do {		do {
Stage++;		Stage++;
RegionIdx = 0;		RegionIdx = 0;
MachineBasicBlock *MBB = nullptr;		MachineBasicBlock *MBB = nullptr;

if (Stage > 1) {		if (Stage > InitialSchedule) {
		if (!LIS)
		break;

// Retry function scheduling if we found resulting occupancy and it is		// Retry function scheduling if we found resulting occupancy and it is
// lower than used for first pass scheduling. This will give more freedom		// lower than used for first pass scheduling. This will give more freedom
// to schedule low register pressure blocks.		// to schedule low register pressure blocks.
// Code is partially copied from MachineSchedulerBase::scheduleRegions().		// Code is partially copied from MachineSchedulerBase::scheduleRegions().

if (!LIS \|\| StartingOccupancy <= MinOccupancy)		if (Stage == UnclusteredReschedule) {
		if (RescheduleRegions.none())
		continue;
		LLVM_DEBUG(dbgs() <<
		"Retrying function scheduling without clustering.\n");
		}

		if (Stage == ClusteredLowOccupancyReschedule) {
		if (StartingOccupancy <= MinOccupancy)
break;		break;

LLVM_DEBUG(		LLVM_DEBUG(
dbgs()		dbgs()
<< "Retrying function scheduling with lowest recorded occupancy "		<< "Retrying function scheduling with lowest recorded occupancy "
<< MinOccupancy << ".\n");		<< MinOccupancy << ".\n");

S.setTargetOccupancy(MinOccupancy);		S.setTargetOccupancy(MinOccupancy);
}		}
		}

		if (Stage == UnclusteredReschedule)
		SavedMutations.swap(Mutations);

for (auto Region : Regions) {		for (auto Region : Regions) {
		if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
		continue;

RegionBegin = Region.first;		RegionBegin = Region.first;
RegionEnd = Region.second;		RegionEnd = Region.second;

if (RegionBegin->getParent() != MBB) {		if (RegionBegin->getParent() != MBB) {
if (MBB) finishBlock();		if (MBB) finishBlock();
MBB = RegionBegin->getParent();		MBB = RegionBegin->getParent();
startBlock(MBB);		startBlock(MBB);
if (Stage == 1)		if (Stage == InitialSchedule)
computeBlockPressure(MBB);		computeBlockPressure(MBB);
}		}

unsigned NumRegionInstrs = std::distance(begin(), end());		unsigned NumRegionInstrs = std::distance(begin(), end());
enterRegion(MBB, begin(), end(), NumRegionInstrs);		enterRegion(MBB, begin(), end(), NumRegionInstrs);

// Skip empty scheduling regions (0 or 1 schedulable instructions).		// Skip empty scheduling regions (0 or 1 schedulable instructions).
if (begin() == end() \|\| begin() == std::prev(end())) {		if (begin() == end() \|\| begin() == std::prev(end())) {
Show All 11 Lines	for (auto Region : Regions) {

schedule();		schedule();

exitRegion();		exitRegion();
++RegionIdx;		++RegionIdx;
}		}
finishBlock();		finishBlock();

} while (Stage < 2);		if (Stage == UnclusteredReschedule)
		SavedMutations.swap(Mutations);
		} while (Stage != LastStage);
}		}

llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll

This file was added.

				; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

				; Interleave loads and stores to fit into 9 VGPR limit.
				; This requires to avoid load/store clustering.

				; GCN: global_load_dwordx4
				; GCN: global_store_dwordx4
				; GCN: global_load_dwordx4
				; GCN: global_store_dwordx4
				; GCN: global_load_dwordx4
				; GCN: global_store_dwordx4
				; GCN: NumVgprs: {{[0-9]$}}
				; GCN: ScratchSize: 0{{$}}

				define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
				bb:
				%id = call i32 @llvm.amdgcn.workitem.id.x()
				%base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
				%tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
				%tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
				%tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
				%tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
				%tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
				%tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
				store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
				%tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
				store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
				%tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
				store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
				ret void
				}

				declare i32 @llvm.amdgcn.workitem.id.x() #0

				attributes #0 = { nounwind readnone }
				attributes #1 = { "amdgpu-num-vgpr"="9" }