Diff 330622

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

	Show First 20 Lines • Show All 312 Lines • ▼ Show 20 Lines

	/// Mark all relevant definitions of register \p Reg in usage \p UseMI.			/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
	void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,			void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
	Register Reg, unsigned SubReg, char Flag,			Register Reg, unsigned SubReg, char Flag,
	std::vector<WorkItem> &Worklist) {			std::vector<WorkItem> &Worklist) {
	LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);			LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

	LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));			LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
	if (!UseLRQ.valueIn())			const VNInfo *Value = UseLRQ.valueIn();
				if (!Value)
	return;			return;

	// Note: this code assumes that lane masks on AMDGPU completely			// Note: this code assumes that lane masks on AMDGPU completely
	// cover registers.			// cover registers.
				const LaneBitmask UseLanes =
				SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
				: (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
				: LaneBitmask::getNone());

				// Perform a depth-first iteration of the LiveRange graph marking defs.
				// Stop processing of a given branch when all use lanes have been defined.
				// The first definition stops processing for a physical register.
				struct PhiEntry {
				const VNInfo *Phi;
				unsigned PredIdx;
				unsigned VisitIdx;
	LaneBitmask DefinedLanes;			LaneBitmask DefinedLanes;
	LaneBitmask UseLanes;
	if (SubReg) {
	UseLanes = TRI->getSubRegIndexLaneMask(SubReg);
	} else if (Reg.isVirtual()) {
	UseLanes = MRI->getMaxLaneMaskForVReg(Reg);
	}

	SmallPtrSet<const VNInfo *, 4> Visited;			PhiEntry(const VNInfo *Phi, unsigned PredIdx, unsigned VisitIdx,
	SmallVector<const VNInfo *, 4> ToProcess;			LaneBitmask DefinedLanes)
	ToProcess.push_back(UseLRQ.valueIn());			: Phi(Phi), PredIdx(PredIdx), VisitIdx(VisitIdx),
				DefinedLanes(DefinedLanes) {}
				};
				SmallSetVector<const VNInfo *, 4> Visited;
				SmallVector<PhiEntry, 2> PhiStack;
				LaneBitmask DefinedLanes;
				unsigned NextPredIdx; // Only used for processing phi nodes
				piotr: I think it would be clearer if you mentioned that NextPredIdx is only needed for the phi case, as opposed to DefinedLanes for instance. Can you please add a comment or change its name to include "phi" somehow (whatever seems right for you)?
				critson (author): Yep, I can rename this variable.
	do {			do {
	const VNInfo *Value = ToProcess.pop_back_val();			const VNInfo *NextValue = nullptr;

				if (!Visited.count(Value)) {
	Visited.insert(Value);			Visited.insert(Value);
				// On the first visit to a phi, start processing at its first predecessor
				NextPredIdx = 0;
				}

	if (Value->isPHIDef()) {			if (Value->isPHIDef()) {
	// Need to mark all defs used in the PHI node			// Each predecessor node in the phi must be processed as a subgraph
	const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);			const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
	assert(MBB && "Phi-def has no defining MBB");			assert(MBB && "Phi-def has no defining MBB");
	for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
	PE = MBB->pred_end();			// Find next predecessor to process
	PI != PE; ++PI) {			unsigned Idx = NextPredIdx;
				piotr: To keep the original behaviour, you also need to adjust Idx here and assign it to NextPredIdx, because Idx is used after the for loop in line 377, right?
				critson (author): Yes, I noticed that immediately after pushing this diff.
				auto PI = MBB->pred_begin() + Idx;
				auto PE = MBB->pred_end();
				for (; PI != PE && !NextValue; ++PI, ++Idx) {
				foad: I think you could use a SetVector here instead of a separate set and vector.
	if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {			if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
	if (!Visited.count(VN))			if (!Visited.count(VN))
				piotr: Instead of starting with Idx = 0 and skipping, can you start with Idx = NextPredIdx? That would remove the need for the "continue".
				critson (author): What is happening here is we are skipping through the list of predecessor blocks to the correct number. I do not think there is a way to access a specific predecessor by index?
				piotr: auto PI = MBB->pred_begin() + NextPredIdx; ?
				critson (author): Thanks! I didn't know you could do that with iterators. (You can see the limits of my C++ knowledge.)
	ToProcess.push_back(VN);			NextValue = VN;
	}			}
	}			}

				// If there are more predecessors to process, add the phi to the stack
				foad: "predecessors"
				if (PI != PE)
				PhiStack.emplace_back(Value, Idx, Visited.size(), DefinedLanes);
	} else {			} else {
	MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);			MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
	assert(MI && "Def has no defining instruction");			assert(MI && "Def has no defining instruction");

	if (Reg.isVirtual()) {			if (Reg.isVirtual()) {
	// Iterate over all operands to find relevant definitions			// Iterate over all operands to find relevant definitions
	bool HasDef = false;			bool HasDef = false;
	for (const MachineOperand &Op : MI->operands()) {			for (const MachineOperand &Op : MI->operands()) {
	if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))			if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
	continue;			continue;

	// Compute lanes defined and overlap with use			// Compute lanes defined and overlap with use
	LaneBitmask OpLanes =			LaneBitmask OpLanes =
	Op.isUndef() ? LaneBitmask::getAll()			Op.isUndef() ? LaneBitmask::getAll()
	: TRI->getSubRegIndexLaneMask(Op.getSubReg());			: TRI->getSubRegIndexLaneMask(Op.getSubReg());
	LaneBitmask Overlap = (UseLanes & OpLanes);			LaneBitmask Overlap = (UseLanes & OpLanes);

	// Record if this instruction defined any of use			// Record if this instruction defined any of use
	HasDef |= Overlap.any();			HasDef |= Overlap.any();

	// Check if all lanes of use have been defined			// Mark any lanes defined
	DefinedLanes |= OpLanes;			DefinedLanes |= OpLanes;
				}

				// Check if all lanes of use have been defined
	if ((DefinedLanes & UseLanes) != UseLanes) {			if ((DefinedLanes & UseLanes) != UseLanes) {
	// Definition not complete; need to process input value			// Definition not complete; need to process input value
	LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));			LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
	if (const VNInfo *VN = LRQ.valueIn()) {			if (const VNInfo *VN = LRQ.valueIn()) {
	if (!Visited.count(VN))			if (!Visited.count(VN))
	ToProcess.push_back(VN);			NextValue = VN;
	}
	}			}
	}			}

	// Only mark the instruction if it defines some part of the use			// Only mark the instruction if it defines some part of the use
	if (HasDef)			if (HasDef)
	markInstruction(*MI, Flag, Worklist);			markInstruction(*MI, Flag, Worklist);
	} else {			} else {
	// For physical registers simply mark the defining instruction			// For physical registers simply mark the defining instruction
	markInstruction(*MI, Flag, Worklist);			markInstruction(*MI, Flag, Worklist);
	}			}
	}			}
	} while (!ToProcess.empty());

	assert(!Reg.isVirtual() || ((DefinedLanes & UseLanes) == UseLanes));			if (!NextValue && !PhiStack.empty()) {
				// Reached the end of the chain; revert to processing the last phi
				PhiEntry &Entry = PhiStack.back();
				foad: PhiStack.back()
				NextValue = Entry.Phi;
				NextPredIdx = Entry.PredIdx;
				DefinedLanes = Entry.DefinedLanes;
				// Rewind visited set to correct state
				while (Visited.size() > Entry.VisitIdx)
				Visited.pop_back();
				PhiStack.pop_back();
				}

				Value = NextValue;
				} while (Value);
	}			}

	void SIWholeQuadMode::markOperand(const MachineInstr &MI,			void SIWholeQuadMode::markOperand(const MachineInstr &MI,
	const MachineOperand &Op, char Flag,			const MachineOperand &Op, char Flag,
	std::vector<WorkItem> &Worklist) {			std::vector<WorkItem> &Worklist) {
	assert(Op.isReg());			assert(Op.isReg());
	Register Reg = Op.getReg();			Register Reg = Op.getReg();

	▲ Show 20 Lines • Show All 1,156 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/wqm.ll

Show First 20 Lines • Show All 853 Lines • ▼ Show 20 Lines	main_body:
%s = fadd float %a, %b		%s = fadd float %a, %b
ret float %s		ret float %s
}		}

; CHECK-LABEL: {{^}}test_loop_vcc:		; CHECK-LABEL: {{^}}test_loop_vcc:
; CHECK-NEXT: ; %entry		; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec		; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec		; CHECK: s_wqm_b64 exec, exec
		; CHECK: v_mov
		; CHECK: v_mov
		; CHECK: v_mov
		; CHECK: v_mov
; CHECK: s_and_b64 exec, exec, [[LIVE]]		; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store		; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec		; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0		; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000		; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000

; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body		; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]		; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
▲ Show 20 Lines • Show All 582 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/wqm.mir

Show First 20 Lines • Show All 253 Lines • ▼ Show 20 Lines	bb.0:
%2:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN %1:vgpr_32, %0:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec		%2:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN %1:vgpr_32, %0:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
%2.sub0:vreg_64 = V_SET_INACTIVE_B32 %2.sub0:vreg_64, 0, implicit $exec, implicit-def $scc		%2.sub0:vreg_64 = V_SET_INACTIVE_B32 %2.sub0:vreg_64, 0, implicit $exec, implicit-def $scc
%2.sub1:vreg_64 = V_SET_INACTIVE_B32 %2.sub1:vreg_64, 0, implicit $exec, implicit-def $scc		%2.sub1:vreg_64 = V_SET_INACTIVE_B32 %2.sub1:vreg_64, 0, implicit $exec, implicit-def $scc
%3:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec		%3:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
$vgpr0 = STRICT_WWM %3.sub0:vreg_64, implicit $exec		$vgpr0 = STRICT_WWM %3.sub0:vreg_64, implicit $exec
$vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec		$vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec
SI_RETURN_TO_EPILOG $vgpr0, $vgpr1		SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
...		...

		---
		# Check that WQM marking occurs correctly through phi nodes in live range graph.
		# If not then initial V_MOV will not be in WQM.
		#
		#CHECK-LABEL: name: test_wqm_lr_phi
		#CHECK: COPY $exec
		#CHECK-NEXT: S_WQM
		#CHECK-NEXT: V_MOV_B32_e32 -10
		#CHECK-NEXT: V_MOV_B32_e32 0
		name: test_wqm_lr_phi
		tracksRegLiveness: true
		body: |
		bb.0:
		undef %0.sub0:vreg_64 = V_MOV_B32_e32 -10, implicit $exec
		%0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
		%1:sreg_64 = S_GETPC_B64
		%2:sgpr_256 = S_LOAD_DWORDX8_IMM %1:sreg_64, 32, 0, 0

		bb.1:
		$vcc = V_CMP_LT_U32_e64 4, 4, implicit $exec
		S_CBRANCH_VCCNZ %bb.3, implicit $vcc
		S_BRANCH %bb.2

		bb.2:
		%0.sub0:vreg_64 = V_ADD_U32_e32 1, %0.sub1, implicit $exec
		S_BRANCH %bb.3

		bb.3:
		%0.sub1:vreg_64 = V_ADD_U32_e32 1, %0.sub1, implicit $exec
		S_BRANCH %bb.4

		bb.4:
		%3:sgpr_128 = IMPLICIT_DEF
		%4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %2:sgpr_256, %3:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource")
		$vgpr0 = COPY %4.sub0:vreg_128
		$vgpr1 = COPY %4.sub1:vreg_128
		SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
		...

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Fix shortfalls in WQM marking
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 330622

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

llvm/test/CodeGen/AMDGPU/wqm.ll

llvm/test/CodeGen/AMDGPU/wqm.mir

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Fix shortfalls in WQM marking (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 330622

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

llvm/test/CodeGen/AMDGPU/wqm.ll

llvm/test/CodeGen/AMDGPU/wqm.mir

[AMDGPU] Fix shortfalls in WQM marking
ClosedPublic