This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AMDGPU/
-
Target/
-
AMDGPU/
3/3
SILoadStoreOptimizer.cpp
-
test/CodeGen/AMDGPU/
-
CodeGen/
-
AMDGPU/
1/1
merge-sbuffer-load.mir

Differential D133787

[AMDGPU][SILoadStoreOptimizer] Merge SGPR_IMM scalar buffer loads.
ClosedPublic

Authored by kosarev on Sep 13 2022, 9:56 AM.

Download Raw Diff

Details

Reviewers

foad
rampitec
arsenm

Commits

rG693f81628815: [AMDGPU][SILoadStoreOptimizer] Merge SGPR_IMM scalar buffer loads.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

kosarev created this revision.Sep 13 2022, 9:56 AM

Herald added a project: Restricted Project. · View Herald TranscriptSep 13 2022, 9:56 AM

Herald added subscribers: kerbowa, hiraditya, t-tye and 6 others. · View Herald Transcript

kosarev requested review of this revision.Sep 13 2022, 9:56 AM

Herald added a project: Restricted Project. · View Herald TranscriptSep 13 2022, 9:56 AM

Herald added subscribers: llvm-commits, wdng. · View Herald Transcript

Harbormaster completed remote builds in B186398: Diff 459789.Sep 13 2022, 11:10 AM

rampitec added inline comments.Sep 13 2022, 1:34 PM

llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
136	Maybe add a couple of negative tests, one with different SGPRs used and one with non-adjacent offsets?

I can't resist pointing out that this patch would be simpler if we never used the _SGPR form on subtargets where the _SGPR_IMM form is available.

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
644	Don't you need to list the _SGPR forms here too?

In D133787#3788908, @foad wrote:

I can't resist pointing out that this patch would be simpler if we never used the _SGPR form on subtargets where the _SGPR_IMM form is available.

It would be even simpler if we did this in the IR to begin with. It's only after codegen that you have to worry so much about addressing-mode minutiae.

Addressed review feedback.

kosarev marked 2 inline comments as done.Sep 14 2022, 9:03 AM

kosarev added inline comments.

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
644	Oh, that's a nice catch, thanks. I've updated `hasSameBaseAddress()` above to compare the number of address operands so tests could catch this as well.

In D133787#3788908, @foad wrote:

I can't resist pointing out that this patch would be simpler if we never used the _SGPR form on subtargets where the _SGPR_IMM form is available.

True, the implementation would be a bit simpler. I don't mind it either way; it's just that a slightly simpler implementation doesn't necessarily work as an obviously sufficient argument when it comes to user-facing things. I guess some wider discussion might help here.

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
1388	Also changed this line to what looks to me like a more reliable implementation.

Harbormaster completed remote builds in B186649: Diff 460125.Sep 14 2022, 10:04 AM

LGTM

This revision is now accepted and ready to land.Sep 14 2022, 1:48 PM

LGTM, thanks!

This revision was landed with ongoing or failed builds.Sep 15 2022, 5:49 AM

Closed by commit rG693f81628815: [AMDGPU][SILoadStoreOptimizer] Merge SGPR_IMM scalar buffer loads. (authored by kosarev). · Explain Why

This revision was automatically updated to reflect the committed changes.

kosarev added a commit: rG693f81628815: [AMDGPU][SILoadStoreOptimizer] Merge SGPR_IMM scalar buffer loads..

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SILoadStoreOptimizer.cpp

87 lines

test/

CodeGen/

AMDGPU/

merge-sbuffer-load.mir

57 lines

Diff 460383

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Show First 20 Lines • Show All 68 Lines • ▼ Show 20 Lines
#define DEBUG_TYPE "si-load-store-opt"		#define DEBUG_TYPE "si-load-store-opt"

namespace {		namespace {
enum InstClassEnum {		enum InstClassEnum {
UNKNOWN,		UNKNOWN,
DS_READ,		DS_READ,
DS_WRITE,		DS_WRITE,
S_BUFFER_LOAD_IMM,		S_BUFFER_LOAD_IMM,
		S_BUFFER_LOAD_SGPR_IMM,
S_LOAD_IMM,		S_LOAD_IMM,
BUFFER_LOAD,		BUFFER_LOAD,
BUFFER_STORE,		BUFFER_STORE,
MIMG,		MIMG,
TBUFFER_LOAD,		TBUFFER_LOAD,
TBUFFER_STORE,		TBUFFER_STORE,
GLOBAL_LOAD_SADDR,		GLOBAL_LOAD_SADDR,
GLOBAL_STORE_SADDR,		GLOBAL_STORE_SADDR,
Show All 31 Lines	struct CombineInfo {
unsigned CPol = 0;		unsigned CPol = 0;
bool IsAGPR;		bool IsAGPR;
bool UseST64;		bool UseST64;
int AddrIdx[MaxAddressRegs];		int AddrIdx[MaxAddressRegs];
const MachineOperand *AddrReg[MaxAddressRegs];		const MachineOperand *AddrReg[MaxAddressRegs];
unsigned NumAddresses;		unsigned NumAddresses;
unsigned Order;		unsigned Order;

bool hasSameBaseAddress(const MachineInstr &MI) {		bool hasSameBaseAddress(const CombineInfo &CI) {
		if (NumAddresses != CI.NumAddresses)
		return false;

		const MachineInstr &MI = *CI.I;
for (unsigned i = 0; i < NumAddresses; i++) {		for (unsigned i = 0; i < NumAddresses; i++) {
const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);		const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

if (AddrReg[i]->isImm() \|\| AddrRegNext.isImm()) {		if (AddrReg[i]->isImm() \|\| AddrRegNext.isImm()) {
if (AddrReg[i]->isImm() != AddrRegNext.isImm() \|\|		if (AddrReg[i]->isImm() != AddrRegNext.isImm() \|\|
AddrReg[i]->getImm() != AddrRegNext.getImm()) {		AddrReg[i]->getImm() != AddrRegNext.getImm()) {
return false;		return false;
}		}
Show All 22 Lines	bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
// non-register		// non-register
if (!AddrOp->isReg())		if (!AddrOp->isReg())
return false;		return false;

// TODO: We should be able to merge physical reg addresses.		// TODO: We should be able to merge physical reg addresses.
if (AddrOp->getReg().isPhysical())		if (AddrOp->getReg().isPhysical())
return false;		return false;

// If an address has only one use then there will be on other		// If an address has only one use then there will be no other
// instructions with the same address, so we can't merge this one.		// instructions with the same address, so we can't merge this one.
if (MRI.hasOneNonDBGUse(AddrOp->getReg()))		if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
return false;		return false;
}		}
return true;		return true;
}		}

void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);		void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
▲ Show 20 Lines • Show All 149 Lines • ▼ Show 20 Lines	if (TII.isMIMG(MI)) {
return countPopulation(DMaskImm);		return countPopulation(DMaskImm);
}		}
if (TII.isMTBUF(Opc)) {		if (TII.isMTBUF(Opc)) {
return AMDGPU::getMTBUFElements(Opc);		return AMDGPU::getMTBUFElements(Opc);
}		}

switch (Opc) {		switch (Opc) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_LOAD_DWORD_IMM:		case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::GLOBAL_LOAD_DWORD:		case AMDGPU::GLOBAL_LOAD_DWORD:
case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:		case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
case AMDGPU::GLOBAL_STORE_DWORD:		case AMDGPU::GLOBAL_STORE_DWORD:
case AMDGPU::GLOBAL_STORE_DWORD_SADDR:		case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
case AMDGPU::FLAT_LOAD_DWORD:		case AMDGPU::FLAT_LOAD_DWORD:
case AMDGPU::FLAT_STORE_DWORD:		case AMDGPU::FLAT_STORE_DWORD:
return 1;		return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:		case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::GLOBAL_LOAD_DWORDX2:		case AMDGPU::GLOBAL_LOAD_DWORDX2:
case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:		case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX2:		case AMDGPU::GLOBAL_STORE_DWORDX2:
case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:		case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
case AMDGPU::FLAT_LOAD_DWORDX2:		case AMDGPU::FLAT_LOAD_DWORDX2:
case AMDGPU::FLAT_STORE_DWORDX2:		case AMDGPU::FLAT_STORE_DWORDX2:
return 2;		return 2;
case AMDGPU::GLOBAL_LOAD_DWORDX3:		case AMDGPU::GLOBAL_LOAD_DWORDX3:
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:		case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX3:		case AMDGPU::GLOBAL_STORE_DWORDX3:
case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:		case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
case AMDGPU::FLAT_LOAD_DWORDX3:		case AMDGPU::FLAT_LOAD_DWORDX3:
case AMDGPU::FLAT_STORE_DWORDX3:		case AMDGPU::FLAT_STORE_DWORDX3:
return 3;		return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:		case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::GLOBAL_LOAD_DWORDX4:		case AMDGPU::GLOBAL_LOAD_DWORDX4:
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:		case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX4:		case AMDGPU::GLOBAL_STORE_DWORDX4:
case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:		case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
case AMDGPU::FLAT_LOAD_DWORDX4:		case AMDGPU::FLAT_LOAD_DWORDX4:
case AMDGPU::FLAT_STORE_DWORDX4:		case AMDGPU::FLAT_STORE_DWORDX4:
return 4;		return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:		case AMDGPU::S_LOAD_DWORDX8_IMM:
return 8;		return 8;
case AMDGPU::DS_READ_B32: [[fallthrough]];		case AMDGPU::DS_READ_B32: [[fallthrough]];
case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];		case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
case AMDGPU::DS_WRITE_B32: [[fallthrough]];		case AMDGPU::DS_WRITE_B32: [[fallthrough]];
case AMDGPU::DS_WRITE_B32_gfx9:		case AMDGPU::DS_WRITE_B32_gfx9:
return 1;		return 1;
case AMDGPU::DS_READ_B64: [[fallthrough]];		case AMDGPU::DS_READ_B64: [[fallthrough]];
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	if (TII.isMTBUF(Opc)) {
}		}
}		}
return UNKNOWN;		return UNKNOWN;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return S_BUFFER_LOAD_IMM;		return S_BUFFER_LOAD_IMM;
		// For the purposes of this optimization SGPR variants of buffer loads
		// are considered to be zero-offsetted SGPR_IMM loads.
		case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
		return S_BUFFER_LOAD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:		case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:		case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:		case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:		case AMDGPU::S_LOAD_DWORDX8_IMM:
return S_LOAD_IMM;		return S_LOAD_IMM;
case AMDGPU::DS_READ_B32:		case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B32_gfx9:		case AMDGPU::DS_READ_B32_gfx9:
case AMDGPU::DS_READ_B64:		case AMDGPU::DS_READ_B64:
▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines	static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::DS_WRITE_B64:		case AMDGPU::DS_WRITE_B64:
case AMDGPU::DS_WRITE_B64_gfx9:		case AMDGPU::DS_WRITE_B64_gfx9:
return Opc;		return Opc;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;		return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
		// For the purposes of this optimization SGPR variants of buffer loads
		// are considered to be zero-offsetted SGPR_IMM loads.
		case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
		return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:		case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:		case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:		case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:		case AMDGPU::S_LOAD_DWORDX8_IMM:
return AMDGPU::S_LOAD_DWORD_IMM;		return AMDGPU::S_LOAD_DWORD_IMM;
case AMDGPU::GLOBAL_LOAD_DWORD:		case AMDGPU::GLOBAL_LOAD_DWORD:
case AMDGPU::GLOBAL_LOAD_DWORDX2:		case AMDGPU::GLOBAL_LOAD_DWORDX2:
case AMDGPU::GLOBAL_LOAD_DWORDX3:		case AMDGPU::GLOBAL_LOAD_DWORDX3:
▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines	if (AMDGPU::getMTBUFHasSoffset(Opc))
Result.SOffset = true;		Result.SOffset = true;

return Result;		return Result;
}		}

switch (Opc) {		switch (Opc) {
default:		default:
return Result;		return Result;
		case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
		foadUnsubmitted Done Reply Inline Actions Don't you need to list the _SGPR forms here too? foad: Don't you need to list the _SGPR forms here too?
		kosarevAuthorUnsubmitted Done Reply Inline Actions Oh, that's a nice catch, thanks. I've updated `hasSameBaseAddress()` above to compare the number of address operands so tests could catch this as well. kosarev: Oh, that's a nice catch, thanks. I've updated `hasSameBaseAddress()` above to compare the…
		case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
		case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
		case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
		Result.SOffset = true;
		[[fallthrough]];
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:		case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORD_IMM:		case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:		case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:		case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:		case AMDGPU::S_LOAD_DWORDX8_IMM:
▲ Show 20 Lines • Show All 58 Lines • ▼ Show 20 Lines	EltSize =
: 4;		: 4;
break;		break;
case DS_WRITE:		case DS_WRITE:
EltSize =		EltSize =
(Opc == AMDGPU::DS_WRITE_B64 \|\| Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8		(Opc == AMDGPU::DS_WRITE_B64 \|\| Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
: 4;		: 4;
break;		break;
case S_BUFFER_LOAD_IMM:		case S_BUFFER_LOAD_IMM:
		case S_BUFFER_LOAD_SGPR_IMM:
case S_LOAD_IMM:		case S_LOAD_IMM:
EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);		EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
break;		break;
default:		default:
EltSize = 4;		EltSize = 4;
break;		break;
}		}

if (InstClass == MIMG) {		if (InstClass == MIMG) {
DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();		DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
// Offset is not considered for MIMG instructions.		// Offset is not considered for MIMG instructions.
Offset = 0;		Offset = 0;
} else {		} else {
int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);		int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
Offset = I->getOperand(OffsetIdx).getImm();		Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();
}		}

if (InstClass == TBUFFER_LOAD \|\| InstClass == TBUFFER_STORE)		if (InstClass == TBUFFER_LOAD \|\| InstClass == TBUFFER_STORE)
Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();		Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

Width = getOpcodeWidth(I, LSO.TII);		Width = getOpcodeWidth(I, LSO.TII);

if ((InstClass == DS_READ) \|\| (InstClass == DS_WRITE)) {		if ((InstClass == DS_READ) \|\| (InstClass == DS_WRITE)) {
▲ Show 20 Lines • Show All 290 Lines • ▼ Show 20 Lines
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,		bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
const CombineInfo &CI,		const CombineInfo &CI,
const CombineInfo &Paired) {		const CombineInfo &Paired) {
const unsigned Width = (CI.Width + Paired.Width);		const unsigned Width = (CI.Width + Paired.Width);
switch (CI.InstClass) {		switch (CI.InstClass) {
default:		default:
return (Width <= 4) && (STM.hasDwordx3LoadStores() \|\| (Width != 3));		return (Width <= 4) && (STM.hasDwordx3LoadStores() \|\| (Width != 3));
case S_BUFFER_LOAD_IMM:		case S_BUFFER_LOAD_IMM:
		case S_BUFFER_LOAD_SGPR_IMM:
case S_LOAD_IMM:		case S_LOAD_IMM:
switch (Width) {		switch (Width) {
default:		default:
return false;		return false;
case 2:		case 2:
case 4:		case 4:
case 8:		case 8:
return true;		return true;
▲ Show 20 Lines • Show All 314 Lines • ▼ Show 20 Lines	MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
Register DestReg = MRI->createVirtualRegister(SuperRC);		Register DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);		unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

// It shouldn't be possible to get this far if the two instructions		// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()		// don't have a single memoperand, because MachineInstr::mayAlias()
// will return true if this is the case.		// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());		assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

MachineInstr *New =		MachineInstrBuilder New =
BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)		BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
.add(TII->getNamedOperand(CI.I, AMDGPU::OpName::sbase))		.add(TII->getNamedOperand(CI.I, AMDGPU::OpName::sbase));
.addImm(MergedOffset) // offset		if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
.addImm(CI.CPol) // cpol		New.add(TII->getNamedOperand(CI.I, AMDGPU::OpName::soffset));
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));		// For convenience, when SGPR_IMM buffer loads are merged into a
		// zero-offset load, we generate its SGPR variant.
		if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset) != -1)
		kosarevAuthorUnsubmitted Done Reply Inline Actions Also changed this line to what looks to me a more reliable implementation. kosarev: Also changed this line to what looks to me a more reliable implementation.
		New.addImm(MergedOffset);
		New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);		std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);		const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);		const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

// Copy to the old destination registers.		// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);		const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
const auto Dest0 = TII->getNamedOperand(CI.I, AMDGPU::OpName::sdst);		const auto Dest0 = TII->getNamedOperand(CI.I, AMDGPU::OpName::sdst);
▲ Show 20 Lines • Show All 291 Lines • ▼ Show 20 Lines	default:
return 0;		return 0;
case 2:		case 2:
return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;		return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
case 4:		case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;		return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
case 8:		case 8:
return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;		return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}		}
		case S_BUFFER_LOAD_SGPR_IMM:
		switch (Width) {
		default:
		return 0;
		case 2:
		return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR
		: AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
		case 4:
		return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR
		: AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
		case 8:
		return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR
		: AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
		}
case S_LOAD_IMM:		case S_LOAD_IMM:
switch (Width) {		switch (Width) {
default:		default:
return 0;		return 0;
case 2:		case 2:
return AMDGPU::S_LOAD_DWORDX2_IMM;		return AMDGPU::S_LOAD_DWORDX2_IMM;
case 4:		case 4:
return AMDGPU::S_LOAD_DWORDX4_IMM;		return AMDGPU::S_LOAD_DWORDX4_IMM;
▲ Show 20 Lines • Show All 103 Lines • ▼ Show 20 Lines	SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
}		}

return std::make_pair(Idx0, Idx1);		return std::make_pair(Idx0, Idx1);
}		}

const TargetRegisterClass *		const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,		SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
const CombineInfo &Paired) {		const CombineInfo &Paired) {
if (CI.InstClass == S_BUFFER_LOAD_IMM \|\| CI.InstClass == S_LOAD_IMM) {		if (CI.InstClass == S_BUFFER_LOAD_IMM \|\|
		CI.InstClass == S_BUFFER_LOAD_SGPR_IMM \|\| CI.InstClass == S_LOAD_IMM) {
switch (CI.Width + Paired.Width) {		switch (CI.Width + Paired.Width) {
default:		default:
return nullptr;		return nullptr;
case 2:		case 2:
return &AMDGPU::SReg_64_XEXECRegClass;		return &AMDGPU::SReg_64_XEXECRegClass;
case 4:		case 4:
return &AMDGPU::SGPR_128RegClass;		return &AMDGPU::SGPR_128RegClass;
case 8:		case 8:
▲ Show 20 Lines • Show All 375 Lines • ▼ Show 20 Lines	bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
return false;		return false;
}		}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,		void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
std::list<std::list<CombineInfo> > &MergeableInsts) const {		std::list<std::list<CombineInfo> > &MergeableInsts) const {
for (std::list<CombineInfo> &AddrList : MergeableInsts) {		for (std::list<CombineInfo> &AddrList : MergeableInsts) {
if (AddrList.front().InstClass == CI.InstClass &&		if (AddrList.front().InstClass == CI.InstClass &&
AddrList.front().IsAGPR == CI.IsAGPR &&		AddrList.front().IsAGPR == CI.IsAGPR &&
AddrList.front().hasSameBaseAddress(*CI.I)) {		AddrList.front().hasSameBaseAddress(CI)) {
AddrList.emplace_back(CI);		AddrList.emplace_back(CI);
return;		return;
}		}
}		}

// Base address not found, so add a new list.		// Base address not found, so add a new list.
MergeableInsts.emplace_back(1, CI);		MergeableInsts.emplace_back(1, CI);
}		}
▲ Show 20 Lines • Show All 160 Lines • ▼ Show 20 Lines	default:
break;		break;
case DS_READ:		case DS_READ:
NewMI = mergeRead2Pair(CI, Paired, Where->I);		NewMI = mergeRead2Pair(CI, Paired, Where->I);
break;		break;
case DS_WRITE:		case DS_WRITE:
NewMI = mergeWrite2Pair(CI, Paired, Where->I);		NewMI = mergeWrite2Pair(CI, Paired, Where->I);
break;		break;
case S_BUFFER_LOAD_IMM:		case S_BUFFER_LOAD_IMM:
		case S_BUFFER_LOAD_SGPR_IMM:
case S_LOAD_IMM:		case S_LOAD_IMM:
NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);		NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
OptimizeListAgain \|= CI.Width + Paired.Width < 8;		OptimizeListAgain \|= CI.Width + Paired.Width < 8;
break;		break;
case BUFFER_LOAD:		case BUFFER_LOAD:
NewMI = mergeBufferLoadPair(CI, Paired, Where->I);		NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
OptimizeListAgain \|= CI.Width + Paired.Width < 4;		OptimizeListAgain \|= CI.Width + Paired.Width < 4;
break;		break;
▲ Show 20 Lines • Show All 89 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir

Show First 20 Lines • Show All 107 Lines • ▼ Show 20 Lines	bb.0:
%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3		%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))		%1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
%2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))		%2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))

S_ENDPGM 0		S_ENDPGM 0
...		...
---		---


# CHECK-LABEL: name: merge_s_buffer_load_x8_mixed		# CHECK-LABEL: name: merge_s_buffer_load_x8_mixed
# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16)		# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16)
name: merge_s_buffer_load_x8_mixed		name: merge_s_buffer_load_x8_mixed
tracksRegLiveness: true		tracksRegLiveness: true
body: \|		body: \|
bb.0:		bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3		liveins: $sgpr0_sgpr1_sgpr2_sgpr3

%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3		%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))		%1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
%2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))		%2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
%3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))		%3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))
%4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))		%4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))

S_ENDPGM 0		S_ENDPGM 0
...		...
---		---

		# CHECK-LABEL: name: merge_s_buffer_load_sgpr_imm
		# CHECK: S_BUFFER_LOAD_DWORDX4_SGPR %0, %1, 0 :: (dereferenceable invariant load (s128), align 4)
		name: merge_s_buffer_load_sgpr_imm
		rampitecUnsubmitted Done Reply Inline Actions Add couple negative tests maybe, one with different sgprs used and one with non-adjacent offsets? rampitec: Add couple negative tests maybe, one with different sgprs used and one with non-adjacent…
		tracksRegLiveness: true
		body: \|
		bb.0:
		liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4

		%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
		%1:sreg_32 = COPY $sgpr4
		%2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %0:sgpr_128, %1:sreg_32, 0 :: (dereferenceable invariant load (s32))
		%3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %0:sgpr_128, %1:sreg_32, 4, 0 :: (dereferenceable invariant load (s32))
		%4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %0:sgpr_128, %1:sreg_32, 8, 0 :: (dereferenceable invariant load (s32))
		%5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %0:sgpr_128, %1:sreg_32, 12, 0 :: (dereferenceable invariant load (s32))

		S_ENDPGM 0
		...
		---

		# CHECK-LABEL: name: no_merge_for_different_soffsets
		# CHECK: S_BUFFER_LOAD_DWORD_SGPR_IMM %0, %1, 4, 0 :: (dereferenceable invariant load (s32))
		# CHECK: S_BUFFER_LOAD_DWORD_SGPR_IMM %0, %2, 8, 0 :: (dereferenceable invariant load (s32))
		name: no_merge_for_different_soffsets
		tracksRegLiveness: true
		body: \|
		bb.0:
		liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5

		%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
		%1:sreg_32 = COPY $sgpr4
		%2:sreg_32 = COPY $sgpr5
		%3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %0:sgpr_128, %1:sreg_32, 4, 0 :: (dereferenceable invariant load (s32))
		%4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %0:sgpr_128, %2:sreg_32, 8, 0 :: (dereferenceable invariant load (s32))

		S_ENDPGM 0
		...
		---

		# CHECK-LABEL: name: no_merge_for_non_adjacent_offsets
		# CHECK: S_BUFFER_LOAD_DWORD_SGPR_IMM %0, %1, 4, 0 :: (dereferenceable invariant load (s32))
		# CHECK: S_BUFFER_LOAD_DWORD_SGPR_IMM %0, %1, 12, 0 :: (dereferenceable invariant load (s32))
		name: no_merge_for_non_adjacent_offsets
		tracksRegLiveness: true
		body: \|
		bb.0:
		liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4

		%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
		%1:sreg_32 = COPY $sgpr4
		%2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %0:sgpr_128, %1:sreg_32, 4, 0 :: (dereferenceable invariant load (s32))
		%3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %0:sgpr_128, %1:sreg_32, 12, 0 :: (dereferenceable invariant load (s32))

		S_ENDPGM 0
		...
		---