Diff 260800

llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp

Show First 20 Lines • Show All 274 Lines • ▼ Show 20 Lines

char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;		char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;

unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {		unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
assert(Register::isPhysicalRegister(Reg));		assert(Register::isPhysicalRegister(Reg));

const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);		const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
unsigned Size = TRI->getRegSizeInBits(*RC);		unsigned Size = TRI->getRegSizeInBits(*RC);
if (Size > 32)		if (Size == 16)
		Reg = TRI->get32BitRegister(Reg);
		else if (Size > 32)
Reg = TRI->getSubReg(Reg, AMDGPU::sub0);		Reg = TRI->getSubReg(Reg, AMDGPU::sub0);

if (TRI->hasVGPRs(RC)) {		if (TRI->hasVGPRs(RC)) {
Reg -= AMDGPU::VGPR0;		Reg -= AMDGPU::VGPR0;
return Reg % NUM_VGPR_BANKS;		return Reg % NUM_VGPR_BANKS;
}		}

Reg = TRI->getEncodingValue(Reg) / 2;		Reg = TRI->getEncodingValue(Reg) / 2;
Show All 9 Lines	if (Register::isVirtualRegister(Reg)) {
Reg = VRM->getPhys(Reg);		Reg = VRM->getPhys(Reg);
if (!Reg)		if (!Reg)
return 0;		return 0;
if (SubReg)		if (SubReg)
Reg = TRI->getSubReg(Reg, SubReg);		Reg = TRI->getSubReg(Reg, SubReg);
}		}

const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);		const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
unsigned Size = TRI->getRegSizeInBits(*RC) / 32;		unsigned Size = TRI->getRegSizeInBits(*RC);

		if (Size == 16) {
		Reg = TRI->get32BitRegister(Reg);
		Size = 1;
		} else {
		Size /= 32;
if (Size > 1)		if (Size > 1)
Reg = TRI->getSubReg(Reg, AMDGPU::sub0);		Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
		}

if (TRI->hasVGPRs(RC)) {		if (TRI->hasVGPRs(RC)) {
// VGPRs have 4 banks assigned in a round-robin fashion.		// VGPRs have 4 banks assigned in a round-robin fashion.
Reg -= AMDGPU::VGPR0;		Reg -= AMDGPU::VGPR0;
uint32_t Mask = maskTrailingOnes<uint32_t>(Size);		uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
unsigned Used = 0;		unsigned Used = 0;
// Bitmask lacks an extract method		// Bitmask lacks an extract method
for (unsigned I = 0; I < Size; ++I)		for (unsigned I = 0; I < Size; ++I)
▲ Show 20 Lines • Show All 115 Lines • ▼ Show 20 Lines	for (auto U : MRI->use_nodbg_operands(Reg)) {
if (U.isImplicit())		if (U.isImplicit())
return false;		return false;
const MachineInstr *UseInst = U.getParent();		const MachineInstr *UseInst = U.getParent();
if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)		if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
return false;		return false;
}		}

const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);		const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
		unsigned Size = TRI->getRegSizeInBits(*RC);

		// TODO: Support 16 bit registers. Those needs to be moved with their
		// parent VGPR_32 and potentially a sibling 16 bit sub-register.
		if (Size < 32)
		return false;
		arsenmUnsubmitted Not Done Reply Inline Actions This kills the code below arsenm: This kills the code below
		rampitecAuthorUnsubmitted Done Reply Inline Actions Yes, and it has a TODO above it. These registers are still reassignable; it just needs more code. rampitec: Yes, and it has a TODO above it. These registers are still reassignable; it just needs more code.

if (TRI->hasVGPRs(RC))		if (TRI->hasVGPRs(RC))
return true;		return true;

unsigned Size = TRI->getRegSizeInBits(*RC);		if (Size == 16)
		return AMDGPU::SGPR_LO16RegClass.contains(PhysReg);
		arsenmUnsubmitted Not Done Reply Inline Actions Does this need to be updated for SReg_Lo16 in the other patch? arsenm: Does this need to be updated for SReg_Lo16 in the other patch?
		rampitecAuthorUnsubmitted Done Reply Inline Actions No, the pass does not reassign special SGPRs. In essence, you cannot replace vcc with s[0:1] here. rampitec: No, the pass does not reassign special SGPRs. In essence, you cannot replace vcc with s[0:1] here.

if (Size > 32)		if (Size > 32)
PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);		PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
		arsenmUnsubmitted Done Reply Inline Actions Can't you just check SGPR_LO16.contains instead of going through the size check? I would expect you can get rid of the explicit size check by just trying getSubReg/getSuperReg and seeing if either failed arsenm: Can't you just check SGPR_LO16.contains instead of going through the size check? I would expect…

return AMDGPU::SGPR_32RegClass.contains(PhysReg);		return AMDGPU::SGPR_32RegClass.contains(PhysReg);
}		}

unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,		unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
unsigned UsedBanks) const {		unsigned UsedBanks) const {
unsigned Size = countPopulation(Mask);		unsigned Size = countPopulation(Mask);
unsigned FreeBanks = 0;		unsigned FreeBanks = 0;
▲ Show 20 Lines • Show All 349 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 688 Lines • ▼ Show 20 Lines	if (RI.getRegSizeInBits(*RC) == 16) {
bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);		bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);		bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) \|\|		bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) \|\|
AMDGPU::SReg_LO16RegClass.contains(DestReg) \|\|		AMDGPU::SReg_LO16RegClass.contains(DestReg) \|\|
AMDGPU::AGPR_LO16RegClass.contains(DestReg);		AMDGPU::AGPR_LO16RegClass.contains(DestReg);
bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) \|\|		bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) \|\|
AMDGPU::SReg_LO16RegClass.contains(SrcReg) \|\|		AMDGPU::SReg_LO16RegClass.contains(SrcReg) \|\|
AMDGPU::AGPR_LO16RegClass.contains(SrcReg);		AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
const TargetRegisterClass *DstRC = IsSGPRDst ? &AMDGPU::SGPR_32RegClass		MCRegister NewDestReg = RI.get32BitRegister(DestReg);
: IsAGPRDst ? &AMDGPU::AGPR_32RegClass		MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
: &AMDGPU::VGPR_32RegClass;
const TargetRegisterClass *SrcRC = IsSGPRSrc ? &AMDGPU::SGPR_32RegClass
: IsAGPRSrc ? &AMDGPU::AGPR_32RegClass
: &AMDGPU::VGPR_32RegClass;
MCRegister NewDestReg =
RI.getMatchingSuperReg(DestReg, DstLow ? AMDGPU::lo16 : AMDGPU::hi16,
DstRC);
MCRegister NewSrcReg =
RI.getMatchingSuperReg(SrcReg, SrcLow ? AMDGPU::lo16 : AMDGPU::hi16,
SrcRC);

if (IsSGPRDst) {		if (IsSGPRDst) {
if (!IsSGPRSrc) {		if (!IsSGPRSrc) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);		reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;		return;
}		}

BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)		BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
▲ Show 20 Lines • Show All 6,103 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Show First 20 Lines • Show All 277 Lines • ▼ Show 20 Lines	static unsigned getNumCoveredRegs(LaneBitmask LM) {
uint64_t Even = Mask & 0xAAAAAAAAAAAAAAAAULL;		uint64_t Even = Mask & 0xAAAAAAAAAAAAAAAAULL;
Mask = (Even >> 1) \| Mask;		Mask = (Even >> 1) \| Mask;
uint64_t Odd = Mask & 0x5555555555555555ULL;		uint64_t Odd = Mask & 0x5555555555555555ULL;
return countPopulation(Odd);		return countPopulation(Odd);
}		}

// \returns a DWORD offset of a \p SubReg		// \returns a DWORD offset of a \p SubReg
unsigned getChannelFromSubReg(unsigned SubReg) const {		unsigned getChannelFromSubReg(unsigned SubReg) const {
return SubReg ? divideCeil(getSubRegIdxOffset(SubReg), 32) : 0;		return SubReg ? (getSubRegIdxOffset(SubReg) + 31) / 32 : 0;
}		}
		arsenmUnsubmitted Not Done Reply Inline Actions I would expect this to be an assert, but I guess it already handled this case arsenm: I would expect this to be an assert, but I guess it already handled this case
		rampitecAuthorUnsubmitted Done Reply Inline Actions That's because hi16 returns 16 as an offset, and then divideCeil returns 1 which is plain wrong. rampitec: That's because hi16 returns 16 as an offset, and then divideCeil returns 1 which is plain wrong.
		foadUnsubmitted Not Done Reply Inline Actions divideCeil(16, 32) returns 1 which is plain right! foad: divideCeil(16, 32) returns 1 which is plain right!
		rampitecAuthorUnsubmitted Done Reply Inline Actions It is channel 0 and not 1, and it was a bug. rampitec: It is channel 0 and not 1, and it was a bug.

// \returns a DWORD size of a \p SubReg		// \returns a DWORD size of a \p SubReg
unsigned getNumChannelsFromSubReg(unsigned SubReg) const {		unsigned getNumChannelsFromSubReg(unsigned SubReg) const {
return getNumCoveredRegs(getSubRegIndexLaneMask(SubReg));		return getNumCoveredRegs(getSubRegIndexLaneMask(SubReg));
}		}

		// For a given 16 bit \p Reg \returns a 32 bit register holding it.
		// \returns \p Reg otherwise.
		MCPhysReg get32BitRegister(MCPhysReg Reg) const;

private:		private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,		void buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,		unsigned LoadStoreOp,
int Index,		int Index,
Register ValueReg,		Register ValueReg,
bool ValueIsKill,		bool ValueIsKill,
MCRegister ScratchRsrcReg,		MCRegister ScratchRsrcReg,
MCRegister ScratchOffsetReg,		MCRegister ScratchOffsetReg,
int64_t InstrOffset,		int64_t InstrOffset,
MachineMemOperand *MMO,		MachineMemOperand *MMO,
RegScavenger *RS) const;		RegScavenger *RS) const;
};		};

} // End namespace llvm		} // End namespace llvm

#endif		#endif

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Show First 20 Lines • Show All 1,792 Lines • ▼ Show 20 Lines	MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,

if (!Def \|\| !MDT.dominates(Def, &Use))		if (!Def \|\| !MDT.dominates(Def, &Use))
return nullptr;		return nullptr;

assert(Def->modifiesRegister(Reg, this));		assert(Def->modifiesRegister(Reg, this));

return Def;		return Def;
}		}

		MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
		const TargetRegisterClass *RC = getPhysRegClass(Reg);
		assert(getRegSizeInBits(*RC) <= 32);

		arsenmUnsubmitted Done Reply Inline Actions Seems like this should just be an assert arsenm: Seems like this should just be an assert
		rampitecAuthorUnsubmitted Done Reply Inline Actions I am going to use it without checking whether a register is actually 16 or 32 bit, just to get a 32 bit operand from whatever input. But it is reasonable to add an assert for "Size <= 32". rampitec: I am going to use it without checking whether a register is actually 16 or 32 bit, just to get a 32 bit…
		arsenmUnsubmitted Done Reply Inline Actions I think you can get away with looking up the specific register class, and picking the right 32-bit class by just trying VGPR, SGPR, AGPR in succession and seeing if any succeeds arsenm: I think you can get away with looking up the specific register class, and picking the right 32…
		rampitecAuthorUnsubmitted Done Reply Inline Actions I could but I don't think it is faster. rampitec: I could but I don't think it is faster.
		for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
		AMDGPU::SReg_32RegClass,
		AMDGPU::AGPR_32RegClass } ) {
		if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
		return Super;
		}
		if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
		&AMDGPU::VGPR_32RegClass)) {
		return Super;
		}

		return AMDGPU::NoRegister;
		}

llvm/test/CodeGen/AMDGPU/regbank-reassign.mir

Show First 20 Lines • Show All 358 Lines • ▼ Show 20 Lines	bb.0:
DS_WRITE2_B32_gfx9 %2, %1.sub4, %1.sub5, 4, 5, 0, implicit $exec		DS_WRITE2_B32_gfx9 %2, %1.sub4, %1.sub5, 4, 5, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub6, %1.sub7, 6, 7, 0, implicit $exec		DS_WRITE2_B32_gfx9 %2, %1.sub6, %1.sub7, 6, 7, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub8, %1.sub9, 8, 9, 0, implicit $exec		DS_WRITE2_B32_gfx9 %2, %1.sub8, %1.sub9, 8, 9, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub10, %1.sub11, 10, 11, 0, implicit $exec		DS_WRITE2_B32_gfx9 %2, %1.sub10, %1.sub11, 10, 11, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub12, %1.sub13, 12, 13, 0, implicit $exec		DS_WRITE2_B32_gfx9 %2, %1.sub12, %1.sub13, 12, 13, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub14, %1.sub15, 14, 15, 0, implicit $exec		DS_WRITE2_B32_gfx9 %2, %1.sub14, %1.sub15, 14, 15, 0, implicit $exec
S_ENDPGM 0		S_ENDPGM 0
...		...

		# GCN-LABEL: vgpr_lo16_sub{{$}}
		# GCN: renamable $vgpr0 = V_AND_B32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
		# GCN: renamable $vgpr1_lo16 = COPY renamable $vgpr0_lo16
		---
		name: vgpr_lo16_sub
		tracksRegLiveness: true
		registers:
		- { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
		- { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
		- { id: 2, class: vgpr_32 }
		- { id: 3, class: vgpr_lo16 }
		arsenmUnsubmitted Not Done Reply Inline Actions If you don't need the preferred-register hints you can eliminate the registers section arsenm: If you don't need the preferred-register hints you can eliminate the registers section
		rampitecAuthorUnsubmitted Done Reply Inline Actions I intend to reuse these tests when registers become reassignable. I also want to trigger the code path in the pass which actually goes through the bank search logic. If RA just assigns v0 and v1, not all of the code would be triggered. rampitec: I intend to reuse these tests when registers become reassignable. I also want to trigger the code…
		body: \|
		bb.0:
		%0 = IMPLICIT_DEF
		%1 = IMPLICIT_DEF
		%2 = V_AND_B32_e32 %1, %0, implicit $exec
		%3 = COPY %2.lo16
		$vgpr1_lo16 = COPY %3
		SI_RETURN_TO_EPILOG $vgpr1_lo16
		...

		# GCN-LABEL: vgpr_lo16{{$}}
		# GCN: $vgpr1_lo16 = COPY killed renamable $vgpr0_lo16
		---
		name: vgpr_lo16
		tracksRegLiveness: true
		registers:
		- { id: 0, class: vgpr_lo16, preferred-register: '$vgpr4_lo16' }
		body: \|
		bb.0:
		liveins: $vgpr0_lo16

		%0 = COPY $vgpr0_lo16
		$vgpr1_lo16 = COPY %0
		SI_RETURN_TO_EPILOG $vgpr1_lo16
		...

		# GCN-LABEL: vgpr_hi16_sub{{$}}
		# GCN: renamable $vgpr0 = V_AND_B32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
		# GCN: renamable $vgpr1_hi16 = COPY renamable $vgpr0_hi16
		---
		name: vgpr_hi16_sub
		tracksRegLiveness: true
		registers:
		- { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
		- { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
		- { id: 2, class: vgpr_32 }
		- { id: 3, class: vgpr_hi16 }
		body: \|
		bb.0:
		%0 = IMPLICIT_DEF
		%1 = IMPLICIT_DEF
		%2 = V_AND_B32_e32 %1, %0, implicit $exec
		%3 = COPY %2.hi16
		$vgpr1_hi16 = COPY %3
		SI_RETURN_TO_EPILOG $vgpr1_hi16
		...

		# GCN-LABEL: vgpr_hi16{{$}}
		# GCN: $vgpr1_hi16 = COPY killed renamable $vgpr0_hi16
		---
		name: vgpr_hi16
		tracksRegLiveness: true
		registers:
		- { id: 0, class: vgpr_hi16, preferred-register: '$vgpr4_hi16' }
		body: \|
		bb.0:
		liveins: $vgpr0_hi16

		%0 = COPY $vgpr0_hi16
		$vgpr1_hi16 = COPY %0
		SI_RETURN_TO_EPILOG $vgpr1_hi16
		...

		# GCN-LABEL: sgpr_lo16_sub{{$}}
		# GCN: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr14, $sgpr0, implicit-def $scc
		# GCN: renamable $sgpr1_lo16 = COPY renamable $sgpr0_lo16
		---
		name: sgpr_lo16_sub
		tracksRegLiveness: true
		registers:
		- { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
		- { id: 1, class: sgpr_32 }
		- { id: 2, class: sgpr_lo16 }
		body: \|
		bb.0:
		%0 = IMPLICIT_DEF
		$sgpr0 = IMPLICIT_DEF
		%1 = S_AND_B32 %0, $sgpr0, implicit-def $scc
		%2 = COPY %1.lo16
		$sgpr1_lo16 = COPY %2
		SI_RETURN_TO_EPILOG $sgpr1_lo16
		...

		# GCN-LABEL: sgpr_lo16{{$}}
		# GCN: $sgpr1_lo16 = COPY killed renamable $sgpr0_lo16
		---
		name: sgpr_lo16
		tracksRegLiveness: true
		registers:
		- { id: 0, class: sgpr_lo16, preferred-register: '$sgpr4_lo16' }
		body: \|
		bb.0:
		liveins: $sgpr0_lo16

		%0 = COPY $sgpr0_lo16
		$sgpr1_lo16 = COPY %0
		SI_RETURN_TO_EPILOG $sgpr1_lo16
		...

		# Check that we do not use VGPR3 which we would use otherwise.
		# We cannot use it because of interference with VGPR3_LO16.
		# GCN-LABEL: v1_vs_v5_src_interence{{$}}
		# GCN: V_AND_B32_e32 killed $vgpr7, killed $vgpr1,
		---
		name: v1_vs_v5_src_interence
		tracksRegLiveness: true
		registers:
		- { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
		- { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
		- { id: 2, class: vgpr_32 }
		body: \|
		bb.0:
		%0 = IMPLICIT_DEF
		%1 = IMPLICIT_DEF
		$vgpr3_lo16 = IMPLICIT_DEF
		%2 = V_AND_B32_e32 %1, %0, implicit $exec
		S_ENDPGM 0
		...

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Adapt GCNRegBankReassign for 16 bit subregs
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 260800

llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

llvm/test/CodeGen/AMDGPU/regbank-reassign.mir

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Adapt GCNRegBankReassign for 16 bit subregs
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 260800

llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

llvm/test/CodeGen/AMDGPU/regbank-reassign.mir

[AMDGPU] Adapt GCNRegBankReassign for 16 bit subregs
ClosedPublic