Diff 263432

llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp

	Show All 26 Lines
	};			};

	static bool isExport(const SUnit &SU) {			static bool isExport(const SUnit &SU) {
	const MachineInstr *MI = SU.getInstr();			const MachineInstr *MI = SU.getInstr();
	return MI->getOpcode() == AMDGPU::EXP \|\|			return MI->getOpcode() == AMDGPU::EXP \|\|
	MI->getOpcode() == AMDGPU::EXP_DONE;			MI->getOpcode() == AMDGPU::EXP_DONE;
	}			}

				static bool isPositionExport(const SIInstrInfo TII, SUnit SU) {
				const MachineInstr *MI = SU->getInstr();
				int Imm = TII->getNamedOperand(*MI, AMDGPU::OpName::tgt)->getImm();
				return Imm >= 12 && Imm <= 15;
				foadUnsubmitted Not Done Reply Inline Actions Can you say why it's beneficial to do position exports first? foad: Can you say why it's beneficial to do position exports first?
				critsonAuthorUnsubmitted Done Reply Inline Actions I will add comment. critson: I will add comment.
				}

				static void sortChain(const SIInstrInfo TII, SmallVector<SUnit , 8> &Chain,
				foadUnsubmitted Not Done Reply Inline Actions It's a shame that this sorting is O(n^2) but I guess it's not a problem because the average chain length will be 2? You could probably do this sorting in a cute way with std::partition_copy if you felt inclined: https://en.cppreference.com/w/cpp/algorithm/partition_copy foad: It's a shame that this sorting is O(n^2) but I guess it's not a problem because the average…
				critsonAuthorUnsubmitted Done Reply Inline Actions This sort is O(n): it passes through the list only once, moving elements to the top as it goes. critson: This sort is O(n): it passes through the list only once, moving elements to the top as it goes.
				foadUnsubmitted Not Done Reply Inline Actions Yeah, but each move is done with an erase and an insert, which both have to move all the remaining items in the vector. foad: Yeah, but each move is done with an erase and an insert, which both have to move all the…
				unsigned PosCount) {
				if (!PosCount \|\| PosCount == Chain.size())
				return;

				// Position exports should occur as soon as possible in the shader
				// for optimal performance. This moves position exports before
				// other exports while preserving the order within different export
				// types (pos or other).
				SmallVector<SUnit *, 8> Copy(Chain);
				unsigned PosIdx = 0;
				unsigned OtherIdx = PosCount;
				for (SUnit *SU : Copy) {
				if (isPositionExport(TII, SU))
				Chain[PosIdx++] = SU;
				else
				Chain[OtherIdx++] = SU;
				}
				}

	static void buildCluster(ArrayRef<SUnit > Exports, ScheduleDAGInstrs DAG) {			static void buildCluster(ArrayRef<SUnit > Exports, ScheduleDAGInstrs DAG) {
	// Cluster a series of exports. Also copy all dependencies to the first			SUnit *ChainHead = Exports.front();
	// export to avoid computation being inserted into the chain.
	SUnit *ChainHead = Exports[0];			// Now construct cluster from chain by adding new edges.
	for (unsigned Idx = 0, End = Exports.size() - 1; Idx < End; ++Idx) {			for (unsigned Idx = 0, End = Exports.size() - 1; Idx < End; ++Idx) {
	SUnit *SUa = Exports[Idx];			SUnit *SUa = Exports[Idx];
	SUnit *SUb = Exports[Idx + 1];			SUnit *SUb = Exports[Idx + 1];
	if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
				// Copy all dependencies to the head of the chain to avoid any
				// computation being inserted into the chain.
	for (const SDep &Pred : SUb->Preds) {			for (const SDep &Pred : SUb->Preds) {
	SUnit *PredSU = Pred.getSUnit();			SUnit *PredSU = Pred.getSUnit();
	if (Pred.isWeak() \|\| isExport(*PredSU))			if (!isExport(*PredSU) && !Pred.isWeak())
	continue;
	DAG->addEdge(ChainHead, SDep(PredSU, SDep::Artificial));			DAG->addEdge(ChainHead, SDep(PredSU, SDep::Artificial));
	}			}
	}
				// New barrier edge ordering exports
				DAG->addEdge(SUb, SDep(SUa, SDep::Barrier));
				// Also add cluster edge
				DAG->addEdge(SUb, SDep(SUa, SDep::Cluster));
	}			}
	}			}

	void ExportClustering::apply(ScheduleDAGInstrs *DAG) {			void ExportClustering::apply(ScheduleDAGInstrs *DAG) {
	SmallVector<SmallVector<SUnit *, 8>, 4> ExportChains;			const SIInstrInfo TII = static_cast<const SIInstrInfo >(DAG->TII);
	DenseMap<unsigned, unsigned> ChainMap;
				SmallVector<SUnit *, 8> Chain;

	// Build chains of exports			// Pass through DAG gathering a list of exports and removing barrier edges
				// creating dependencies on exports. Freeing exports of successor edges
				// allows more scheduling freedom, and nothing should be order dependent
				// on exports. Edges will be added later to order the exports.
				unsigned PosCount = 0;
	for (SUnit &SU : DAG->SUnits) {			for (SUnit &SU : DAG->SUnits) {
				foadUnsubmitted Not Done Reply Inline Actions Why are the barrier edges there in the first place? Either the exports can be reordered, so the barrier edges should not be there; or they can't be, so we shouldn't ignore the barrier edges! foad: Why are the barrier edges there in the first place? Either the exports can be reordered, so the…
				critsonAuthorUnsubmitted Not Done Reply Inline Actions At a high level, what is a barrier dependency on an export? They get introduced because of intrinsics which access memory, etc. Consider the following: call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float 1.0, float 1.0, float 1.0, float 0.5, i1 false, i1 false) %load = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0) call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float %load, i1 true, i1 false) The load forces ordering between the exports either side of it. I do not think there is a hardware motivation for this? critson: At a high level, what is a barrier dependency on an export? They get introduced because of…
				foadUnsubmitted Not Done Reply Inline Actions EXP and EXP_DONE instructions have mayStore=1 and it seems reasonable that the scheduler doesn't have any information about what they're storing or where, so it adds barriers to stop them from being reordered. In particular they satisfy `isGlobalMemoryObject` in ScheduleDAGInstrs.cpp. So I think it's fine to remove the existing barriers and add your own, if you know more about it than the scheduler :-) foad: EXP and EXP_DONE instructions have mayStore=1 and it seems reasonable that the scheduler…
	if (!isExport(SU))			if (isExport(SU)) {
	continue;			Chain.push_back(&SU);
				if (isPositionExport(TII, &SU))
				PosCount++;
				}

	unsigned ChainID = ExportChains.size();			SmallVector<SDep, 2> ToRemove;
	for (const SDep &Pred : SU.Preds) {			for (const SDep &Pred : SU.Preds) {
	const SUnit &PredSU = *Pred.getSUnit();			SUnit *PredSU = Pred.getSUnit();
	if (isExport(PredSU) && !Pred.isArtificial()) {			if (Pred.isBarrier() && isExport(*PredSU))
	ChainID = ChainMap.lookup(PredSU.NodeNum);			ToRemove.push_back(Pred);
	break;
	}
	}			}
	ChainMap[SU.NodeNum] = ChainID;			for (SDep Pred : ToRemove)
				SU.removePred(Pred);
	if (ChainID == ExportChains.size())
	ExportChains.push_back(SmallVector<SUnit *, 8>());

	auto &Chain = ExportChains[ChainID];
	Chain.push_back(&SU);
	}			}

	// Apply clustering			// Apply clustering if there are multiple exports
	for (auto &Chain : ExportChains)			if (Chain.size() > 1) {
				sortChain(TII, Chain, PosCount);
	buildCluster(Chain, DAG);			buildCluster(Chain, DAG);
	}			}
				}

	} // end namespace			} // end namespace

	namespace llvm {			namespace llvm {

	std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation() {			std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation() {
	return std::make_unique<ExportClustering>();			return std::make_unique<ExportClustering>();
	}			}

	} // end namespace llvm			} // end namespace llvm

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -strict-whitespace -check-prefix=GCN %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -strict-whitespace -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -strict-whitespace -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -strict-whitespace -check-prefix=GCN %s

	declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1			declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
	declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1			declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
				declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #2

	; GCN-LABEL: {{^}}test_export_zeroes_f32:			; GCN-LABEL: {{^}}test_export_zeroes_f32:
	; GCN: exp mrt0 off, off, off, off{{$}}			; GCN: exp mrt0 off, off, off, off{{$}}
	; GCN: exp mrt0 off, off, off, off done{{$}}			; GCN: exp mrt0 off, off, off, off done{{$}}
	define amdgpu_kernel void @test_export_zeroes_f32() #0 {			define amdgpu_kernel void @test_export_zeroes_f32() #0 {

	call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)			call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
	call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false)			call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false)
	▲ Show 20 Lines • Show All 538 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @test_export_clustering(float %x, float %y) #0 {			define amdgpu_kernel void @test_export_clustering(float %x, float %y) #0 {
	%z0 = fadd float %x, %y			%z0 = fadd float %x, %y
	call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %x, float %y, float %z0, float 0.0, i1 false, i1 false)			call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %x, float %y, float %z0, float 0.0, i1 false, i1 false)
	%z1 = fsub float %y, %x			%z1 = fsub float %y, %x
	call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %x, float %y, float %z1, float 1.0, i1 true, i1 false)			call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %x, float %y, float %z1, float 1.0, i1 true, i1 false)
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}test_export_pos_before_param:
				; GCN: exp pos0
				; GCN-NOT: s_waitcnt
				; GCN: exp param0
				define amdgpu_kernel void @test_export_pos_before_param(float %x, float %y) #0 {
				%z0 = fadd float %x, %y
				call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float %z0, i1 false, i1 false)
				%z1 = fsub float %y, %x
				call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float %z1, i1 true, i1 false)
				ret void
				}

				; GCN-LABEL: {{^}}test_export_pos_before_param_ordered:
				; GCN: exp pos0
				; GCN: exp pos1
				; GCN: exp pos2
				; GCN-NOT: s_waitcnt
				; GCN: exp param0
				; GCN: exp param1
				; GCN: exp param2
				define amdgpu_kernel void @test_export_pos_before_param_ordered(float %x, float %y) #0 {
				%z0 = fadd float %x, %y
				call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float %z0, i1 false, i1 false)
				call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float 1.0, float 1.0, float 1.0, float %z0, i1 false, i1 false)
				call void @llvm.amdgcn.exp.f32(i32 34, i32 15, float 1.0, float 1.0, float 1.0, float %z0, i1 false, i1 false)
				%z1 = fsub float %y, %x
				call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float %z1, i1 false, i1 false)
				call void @llvm.amdgcn.exp.f32(i32 13, i32 15, float 0.0, float 0.0, float 0.0, float %z1, i1 false, i1 false)
				call void @llvm.amdgcn.exp.f32(i32 14, i32 15, float 0.0, float 0.0, float 0.0, float %z1, i1 true, i1 false)
				ret void
				}

				; GCN-LABEL: {{^}}test_export_pos_before_param_across_load:
				; GCN: exp pos0
				; GCN-NEXT: exp param0
				; GCN-NEXT: exp param1
				define amdgpu_kernel void @test_export_pos_before_param_across_load(i32 %idx) #0 {
				call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 false, i1 false)
				call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float 1.0, float 1.0, float 1.0, float 0.5, i1 false, i1 false)
				%load = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0)
				call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 0.0, float 0.0, float 0.0, float %load, i1 true, i1 false)
				ret void
				}

	attributes #0 = { nounwind }			attributes #0 = { nounwind }
	attributes #1 = { nounwind inaccessiblememonly }			attributes #1 = { nounwind inaccessiblememonly }
				attributes #2 = { nounwind readnone }

llvm/test/CodeGen/AMDGPU/wait.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=DEFAULT			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=DEFAULT
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=DEFAULT			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=DEFAULT
	; RUN: llc -march=amdgcn --misched=ilpmax -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=ILPMAX			; RUN: llc -march=amdgcn --misched=ilpmax -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=ILPMAX
	; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=ILPMAX			; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -strict-whitespace %s --check-prefix=ILPMAX
	; The ilpmax scheduler is used for the second test to get the ordering we want for the test.			; The ilpmax scheduler is used for the second test to get the ordering we want for the test.

	; DEFAULT-LABEL: {{^}}main:			; DEFAULT-LABEL: {{^}}main:
	; DEFAULT: s_load_dwordx4			; DEFAULT: s_load_dwordx4
	; DEFAULT: s_load_dwordx4			; DEFAULT: s_load_dwordx4
	; DEFAULT: s_waitcnt lgkmcnt(0)			; DEFAULT: s_waitcnt lgkmcnt(0)
	; DEFAULT: buffer_load_format_xyzw			; DEFAULT: buffer_load_format_xyzw
	; DEFAULT: buffer_load_format_xyzw			; DEFAULT: buffer_load_format_xyzw
	; DEFAULT: s_waitcnt vmcnt(0)			; DEFAULT-DAG: s_waitcnt vmcnt(0)
	; DEFAULT: exp			; DEFAULT-DAG: exp
	; DEFAULT: exp			; DEFAULT: exp
	; DEFAULT-NEXT: s_endpgm			; DEFAULT-NEXT: s_endpgm
	define amdgpu_vs void @main(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, <16 x i8> addrspace(4)* inreg %arg3, <16 x i8> addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(4)* inreg %constptr) #0 {			define amdgpu_vs void @main(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, <16 x i8> addrspace(4)* inreg %arg3, <16 x i8> addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(4)* inreg %constptr) #0 {
	main_body:			main_body:
	%tmp = getelementptr <16 x i8>, <16 x i8> addrspace(4)* %arg3, i32 0			%tmp = getelementptr <16 x i8>, <16 x i8> addrspace(4)* %arg3, i32 0
	%tmp10 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp, !tbaa !0			%tmp10 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp, !tbaa !0
	%tmp10.cast = bitcast <16 x i8> %tmp10 to <4 x i32>			%tmp10.cast = bitcast <16 x i8> %tmp10 to <4 x i32>
	%tmp11 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i32 0, i32 0)			%tmp11 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i32 0, i32 0)
	▲ Show 20 Lines • Show All 65 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Order pos exports before param exports
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 263432

llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll

llvm/test/CodeGen/AMDGPU/wait.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Order pos exports before param exports — ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 263432

llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll

llvm/test/CodeGen/AMDGPU/wait.ll

[AMDGPU] Order pos exports before param exports
ClosedPublic