This is an archive of the discontinued LLVM Phabricator instance.

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
358–366	The coding style is strange here because it looks like it could call checkReadM0Hazards four times. But I guess in practice at most one of the conditionals will be true?

rampitec added inline comments.Apr 28 2022, 1:56 AM

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
358–366	It tests for different types of instructions, so the actual function will be called at most once. Moreover, these are not common instructions. In fact, scanning the operands to see whether an instruction uses LDS_DIRECT is more expensive.

ping

Collapsed all conditions around checkReadM0Hazards(). To me it is less readable, but since there is a concern that we may call checkReadM0Hazards() more often than needed, I combined them.

arsenm added inline comments.May 3 2022, 10:55 AM

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
358–359	I don't see why you merged in these cases that previously returned early.

rampitec added inline comments.May 3 2022, 10:57 AM

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
358–366	I am generally concerned about these early returns. We may skip checking other hazards which require more wait states. I think we may need to remove all early returns here completely as the complexity of the hazard recognizer grows.

Restored the early return, at least for this patch. I have convinced myself that the hazards checked later cannot interfere with these instructions.

Harbormaster completed remote builds in B162530: Diff 426799.May 3 2022, 1:56 PM

rampitec mentioned this in D124884: [AMDGPU] Add intrinsics llvm.amdgcn.{raw|struct}.buffer.load.lds.May 3 2022, 3:06 PM

arsenm accepted this revision.May 4 2022, 1:58 PM

This revision is now accepted and ready to land.May 4 2022, 1:58 PM

Herald added a subscriber: jsilvanus. · View Herald TranscriptMay 4 2022, 1:58 PM

This revision was landed with ongoing or failed builds.May 4 2022, 2:45 PM

Closed by commit rG63f21f4cc7bb: [AMDGPU] Handle LDS DMA and LDS_DIRECT hazards (authored by rampitec). · Explain Why

This revision was automatically updated to reflect the committed changes.

rampitec added a commit: rG63f21f4cc7bb: [AMDGPU] Handle LDS DMA and LDS_DIRECT hazards.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

GCNHazardRecognizer.cpp

35 lines

GCNSubtarget.h

8 lines

test/

CodeGen/

AMDGPU/

hazard.mir

24 lines

lds-dma-hazards.mir

49 lines

Diff 426772

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Show First 20 Lines • Show All 160 Lines • ▼ Show 20 Lines
}		}

static bool isPermlane(const MachineInstr &MI) {		static bool isPermlane(const MachineInstr &MI) {
unsigned Opcode = MI.getOpcode();		unsigned Opcode = MI.getOpcode();
return Opcode == AMDGPU::V_PERMLANE16_B32_e64 \|\|		return Opcode == AMDGPU::V_PERMLANE16_B32_e64 \|\|
Opcode == AMDGPU::V_PERMLANEX16_B32_e64;		Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
}		}

		static bool isLdsDma(const MachineInstr &MI) {
		return SIInstrInfo::isVALU(MI) &&
		(SIInstrInfo::isMUBUF(MI) \|\| SIInstrInfo::isFLAT(MI));
		}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {		static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,		const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);		AMDGPU::OpName::simm16);
return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;		return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}		}

ScheduleHazardRecognizer::HazardType		ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {		GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
return HazardType;		return HazardType;

if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)		if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
return HazardType;		return HazardType;

if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)		if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
return HazardType;		return HazardType;

if (ST.hasReadM0MovRelInterpHazard() &&		if (((ST.hasReadM0MovRelInterpHazard() &&
(TII.isVINTRP(*MI) \|\| isSMovRel(MI->getOpcode())) &&		(TII.isVINTRP(*MI) \|\| isSMovRel(MI->getOpcode()))) \|\|
checkReadM0Hazards(MI) > 0)		(ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) \|\|
return HazardType;		(ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) \|\|
		(ST.hasReadM0LdsDirectHazard() &&
if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&		MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
checkReadM0Hazards(MI) > 0)		checkReadM0Hazards(MI) > 0)
return HazardType;		return HazardType;

if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)		if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
return HazardType;		return HazardType;

if ((SIInstrInfo::isVMEM(*MI) \|\|		if ((SIInstrInfo::isVMEM(*MI) \|\|
SIInstrInfo::isFLAT(*MI) \|\|		SIInstrInfo::isFLAT(*MI) \|\|
▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines	unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (isSGetReg(MI->getOpcode()))		if (isSGetReg(MI->getOpcode()))
return std::max(WaitStates, checkGetRegHazards(MI));		return std::max(WaitStates, checkGetRegHazards(MI));

if (isSSetReg(MI->getOpcode()))		if (isSSetReg(MI->getOpcode()))
return std::max(WaitStates, checkSetRegHazards(MI));		return std::max(WaitStates, checkSetRegHazards(MI));

if (isRFE(MI->getOpcode()))		if (isRFE(MI->getOpcode()))
return std::max(WaitStates, checkRFEHazards(MI));		return std::max(WaitStates, checkRFEHazards(MI));

if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) \|\|		if ((ST.hasReadM0MovRelInterpHazard() &&
		arsenmUnsubmitted Done Reply Inline Actions Don't see why you merged in these cases that early returned before arsenm: Don't see why you merged in these cases that early returned before
isSMovRel(MI->getOpcode())))		(TII.isVINTRP(*MI) \|\| isSMovRel(MI->getOpcode()))) \|\|
return std::max(WaitStates, checkReadM0Hazards(MI));		(ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) \|\|
		(ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) \|\|
if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))		(ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
return std::max(WaitStates, checkReadM0Hazards(MI));		WaitStates = std::max(WaitStates, checkReadM0Hazards(MI));

if (SIInstrInfo::isMAI(*MI))		if (SIInstrInfo::isMAI(*MI))
		foadUnsubmitted Done Reply Inline Actions The coding style is strange here because it looks like it could call checkReadM0Hazards four times. But I guess in practice at most one of the conditionals will be true? foad: The coding style is strange here because it looks like it could call checkReadM0Hazards four…
		rampitecAuthorUnsubmitted Done Reply Inline Actions It tests for different types of instructions, so the actual function will be called once at most. Moreover, these are not common instructions. In fact scanning for operands to see if it uses LDS_DIRECT is more expensive. rampitec: It tests for different types of instructions, so the actual function will be called once at…
		rampitecAuthorUnsubmitted Done Reply Inline Actions I am generally concerned by these early returns. We may skip checking other hazards which require more waitstates. I think we may need to remove all early returns here completely with the complexity of the hazard recognizer growing. rampitec: I am generally concerned by these early returns. We may skip checking other hazards which…
return std::max(WaitStates, checkMAIHazards(MI));		return std::max(WaitStates, checkMAIHazards(MI));

if (SIInstrInfo::isVMEM(*MI) \|\|		if (SIInstrInfo::isVMEM(*MI) \|\|
SIInstrInfo::isFLAT(*MI) \|\|		SIInstrInfo::isFLAT(*MI) \|\|
SIInstrInfo::isDS(*MI))		SIInstrInfo::isDS(*MI))
return std::max(WaitStates, checkMAILdStHazards(MI));		return std::max(WaitStates, checkMAILdStHazards(MI));

return WaitStates;		return WaitStates;
▲ Show 20 Lines • Show All 639 Lines • ▼ Show 20 Lines	auto IsHazardFn = [TII](const MachineInstr &MI) {
return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;		return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
};		};
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);		int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
return RFEWaitStates - WaitStatesNeeded;		return RFEWaitStates - WaitStatesNeeded;
}		}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {		int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
const SIInstrInfo *TII = ST.getInstrInfo();		const SIInstrInfo *TII = ST.getInstrInfo();
const int SMovRelWaitStates = 1;		const int ReadM0WaitStates = 1;
auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };		auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,		return ReadM0WaitStates -
SMovRelWaitStates);		getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}		}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {		void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVMEMtoScalarWriteHazards(MI);		fixVMEMtoScalarWriteHazards(MI);
fixVcmpxPermlaneHazards(MI);		fixVcmpxPermlaneHazards(MI);
fixSMEMtoVectorWriteHazards(MI);		fixSMEMtoVectorWriteHazards(MI);
fixVcmpxExecWARHazard(MI);		fixVcmpxExecWARHazard(MI);
fixLdsBranchVmemWARHazard(MI);		fixLdsBranchVmemWARHazard(MI);
▲ Show 20 Lines • Show All 1,125 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Show First 20 Lines • Show All 925 Lines • ▼ Show 20 Lines	bool hasReadM0MovRelInterpHazard() const {
return getGeneration() == AMDGPUSubtarget::GFX9;		return getGeneration() == AMDGPUSubtarget::GFX9;
}		}

bool hasReadM0SendMsgHazard() const {		bool hasReadM0SendMsgHazard() const {
return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&		return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
getGeneration() <= AMDGPUSubtarget::GFX9;		getGeneration() <= AMDGPUSubtarget::GFX9;
}		}

		bool hasReadM0LdsDmaHazard() const {
		return getGeneration() == AMDGPUSubtarget::GFX9;
		}

		bool hasReadM0LdsDirectHazard() const {
		return getGeneration() == AMDGPUSubtarget::GFX9;
		}

bool hasVcmpxPermlaneHazard() const {		bool hasVcmpxPermlaneHazard() const {
return HasVcmpxPermlaneHazard;		return HasVcmpxPermlaneHazard;
}		}

bool hasVMEMtoScalarWriteHazard() const {		bool hasVMEMtoScalarWriteHazard() const {
return HasVMEMtoScalarWriteHazard;		return HasVMEMtoScalarWriteHazard;
}		}

▲ Show 20 Lines • Show All 271 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/hazard.mir

Show First 20 Lines • Show All 165 Lines • ▼ Show 20 Lines	body: \|
bb.0:		bb.0:
$m0 = S_MOV_B32 -1		$m0 = S_MOV_B32 -1
SI_MASKED_UNREACHABLE		SI_MASKED_UNREACHABLE

bb.1:		bb.1:
S_SENDMSG 3, implicit $exec, implicit $m0		S_SENDMSG 3, implicit $exec, implicit $m0
S_ENDPGM 0		S_ENDPGM 0
...		...

		# GCN-LABEL: name: buffer_store_lds_dword
		# GCN: $m0 = S_MOV_B32 0
		# GFX9-NEXT: S_NOP 0
		# GCN-NEXT: BUFFER_STORE_LDS_DWORD
		---
		name: buffer_store_lds_dword
		body: \|
		bb.0:
		$m0 = S_MOV_B32 0
		BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec, implicit $m0
		...

		# GCN-LABEL: name: lds_direct_read_m0
		# GCN: $m0 = S_MOV_B32 0
		# GFX9-NEXT: S_NOP 0
		# GCN-NEXT: V_MOV_B32
		---
		name: lds_direct_read_m0
		body: \|
		bb.0:
		$m0 = S_MOV_B32 0
		$vgpr0 = V_MOV_B32_e32 $lds_direct, implicit $exec, implicit $m0
		...

llvm/test/CodeGen/AMDGPU/lds-dma-hazards.mir

This file was added.

				# RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - \| FileCheck --check-prefix=GCN %s

				# GCN-LABEL: name: buffer_load_dword_lds
				# GCN: $m0 = S_MOV_B32 0
				# GCN-NEXT: S_NOP 0
				# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_ADDR64
				---
				name: buffer_load_dword_lds
				body: \|
				bb.0:
				$m0 = S_MOV_B32 0
				BUFFER_LOAD_DWORD_LDS_ADDR64 $vgpr0_vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec, implicit $m0
				...

				# GCN-LABEL: name: buffer_store_lds_dword
				# GCN: $m0 = S_MOV_B32 0
				# GCN-NEXT: S_NOP 0
				# GCN-NEXT: BUFFER_STORE_LDS_DWORD
				---
				name: buffer_store_lds_dword
				body: \|
				bb.0:
				$m0 = S_MOV_B32 0
				BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec, implicit $m0
				...

				# GCN-LABEL: name: global_load_lds_dword
				# GCN: $m0 = S_MOV_B32 0
				# GCN-NEXT: S_NOP 0
				# GCN-NEXT: GLOBAL_LOAD_LDS_DWORD
				---
				name: global_load_lds_dword
				body: \|
				bb.0:
				$m0 = S_MOV_B32 0
				GLOBAL_LOAD_LDS_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $m0
				...

				# GCN-LABEL: name: scratch_load_lds_dword
				# GCN: $m0 = S_MOV_B32 0
				# GCN-NEXT: S_NOP 0
				# GCN-NEXT: SCRATCH_LOAD_LDS_DWORD
				---
				name: scratch_load_lds_dword
				body: \|
				bb.0:
				$m0 = S_MOV_B32 0
				SCRATCH_LOAD_LDS_DWORD $vgpr2, 0, 0, implicit $exec, implicit $m0
				...

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Handle LDS DMA and LDS_DIRECT hazards (Closed, Public)

Details

Diff Detail