This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Make SGPR spills exec mask agnostic
ClosedPublic

Authored by critson on May 20 2020, 2:14 AM.

Details

Summary

Explicitly set the exec mask for SGPR spills and reloads.
This fixes a bug where SGPR spills to memory could be incorrect
if the exec mask was 0 (or differed between spill and reload).

Additionally, pack scalar subregisters (up to 16/32 per VGPR) so that
the majority of scalar types can be spilled or reloaded with a single
memory access. This should amortize some of the additional overhead of
manipulating the exec mask.

Diff Detail

Event Timeline

critson created this revision.May 20 2020, 2:14 AM
Herald added a project: Restricted Project.May 20 2020, 2:14 AM

Missing new tests / checks. This could probably use a MIR test

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
838

Needs a comment explaining the point of this function

858–859

I don't understand these lane numbers; it picks the one past the midpoint lane?

867

I'm not sure I understand only sometimes saving exec_hi

870

We're trying to eliminate use of NoRegister in favor of using the default Register constructor, so you don't need the initializer here (or explicit checks for it later in the function)
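
For example (sketch):

    Register SavedExecReg;   // default-constructed: no register
    // ...
    if (SavedExecReg) {      // validity check, no NoRegister compare
      // ...
    }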

880

getUndefRegState
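
(i.e. the helper from MachineInstrBuilder.h; a sketch of its use, with SpillVGPR and IsFirstSubreg as illustrative names:)

    // getUndefRegState(bool) yields RegState::Undef when the flag is
    // set and 0 otherwise, replacing a hand-written conditional.
    MIB.addReg(SpillVGPR, getUndefRegState(IsFirstSubreg));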

896

Grammar, a odd size->an odd size

906–912

What other places do is have ExecMovOpc and ExecReg set from isWave32 and unify the BuildMI calls
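
Something like (sketch; ST is the assumed subtarget handle):

    // Pick the opcode and register once, then share the BuildMI calls.
    unsigned ExecMovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32
                                        : AMDGPU::S_MOV_B64;
    Register ExecReg = ST.isWave32() ? Register(AMDGPU::EXEC_LO)
                                     : Register(AMDGPU::EXEC);
    BuildMI(MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);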

907–908

Could this just turn on all lanes for the memory spill? mov -1 is always free for code size, but some other bitmask may not be

916

Should assert this isn't an SGPRSpill stack ID. You can't reuse the SGPR spill frame index for the real stack spill index
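
That is, something along the lines of (sketch; MFI and Index as in the surrounding code):

    // An SGPRSpill stack ID refers to VGPR lanes, not a real stack
    // slot, so it must never reach the memory spill path.
    assert(MFI.getStackID(Index) != TargetStackID::SGPRSpill &&
           "attempting a memory spill with an SGPR-spill frame index");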

920–921

I think the correct thing to do is use the base alignment here and add the offset to PtrInfo.getWithOffset

924–930

I know this is what this was doing before, but do we really need to use the intermediate spill pseudo here? As a follow on change could we directly emit the stack operation here?

critson marked 14 inline comments as done.May 21 2020, 4:22 AM
critson added inline comments.
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
858–859

Correct, because lanes before the midpoint may be used to store the SGPRs.
I have added a comment.

867

Made the comment clearer.
The idea is that we try to turn off all unused lanes for the load/store on the assumption that this /may/ have some benefit (performance or power). However, if we do not have space for saving a copy of EXEC_HI then it is safe not to adjust the lanes.

870

I can get rid of the initializer, but I definitely need the explicit tests, as getMatchingSuperReg returns NoRegister, which must be handled correctly.

906–912

I have applied a simplification to the code based on this.

907–908

As mentioned above, the idea is that we try to only enable lanes that are used, in case there is some potential benefit.
The only lane combinations which require additional encoding space are S256, S512 and S1024.
Note that S1024 will not occur on Wave32, as we never fill more than half the lanes of a VGPR before calling this function.
I would argue that the encoding space is probably not an issue, as the most common spills are S32, S64 and S128.
Of course, I am working without any tangible assessment of the benefit of disabling lanes; it might be that for >=S256 we should just set the mask to -1 anyway.
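
(For reference, the mask itself is just one bit per packed 32-bit subregister; a sketch, with NumSubRegs being the number of subregisters packed into the VGPR:)

    // One enabled lane per packed subregister, e.g. S32 -> 0x1,
    // S128 -> 0xF, S256 -> 0xFF, S1024 -> 0xFFFFFFFF. NumSubRegs is
    // at most 32, so the shift cannot overflow.
    uint64_t VGPRLanes = (1LL << NumSubRegs) - 1;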

916

Sorry, I am not totally clear on what you are saying. This code is only reached if it is not an SGPR-to-VGPR spill slot (tested in spillSGPR / restoreSGPR).

920–921

Again, I am not clear on this. Can you explain further?

924–930

Agreed, I will work on a follow up change to directly emit VMEM.

critson updated this revision to Diff 265474.May 21 2020, 4:22 AM
critson marked 5 inline comments as done.
  • Address comments.
  • Add MIR test.
critson updated this revision to Diff 266173.May 26 2020, 5:10 AM
  • Tidy test whitespace
  • Directly output buffer_store/load from SGPR spill.
  • Add assertion for type of spill slot.
arsenm added inline comments.May 27 2020, 3:59 PM
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
870

These are the same thing. getMatchingSuperReg should return MCRegister. All of the unsigneds referring to registers should use Register

907–908

But is there an actual benefit? I don't think the hardware saves anything by having fewer lanes. I would rather not add complexity to handle a hypothetical case that doesn't provide a real benefit

916

It's hard to follow how all of the SGPR->VGPR spills work, and it doesn't hurt to be sure this is definitely not an SGPR frame index

920–921

I mean the usage of the MachineMemOperand isn't correct with respect to alignment. It's supposed to be expressed as a base MMO alignment with an offset applied. You're trying to figure out the total alignment. There are helpers for getting the memory operand with an offset
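
That is, roughly (a sketch with illustrative names; the real code supplies the concrete sizes and flags):

    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(MF, Index);
    // The base MMO carries the base alignment of the whole frame object.
    MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOStore, FrameSize, BaseAlign);
    // Derive the per-access operand; the helper applies the offset and
    // recomputes the alignment from the base.
    MachineMemOperand *MMO =
        MF.getMachineMemOperand(BaseMMO, Offset, EltSize);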

critson marked 12 inline comments as done.May 27 2020, 8:26 PM
critson added inline comments.
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
907–908

I have simplified the code.

916

Assertion added.

920–921

As this now directly calls buildSpillLoadStore it only needs to generate an MMO for the base pointer (with base alignment), and the offset is passed directly to buildSpillLoadStore which handles any further alignment requirements (in the way suggested).

critson updated this revision to Diff 266726.May 27 2020, 8:27 PM
critson marked 3 inline comments as done.
  • Address outstanding comments.
  • Add stack offset checks to MIR test.
critson updated this revision to Diff 267426.May 29 2020, 8:13 PM

Update spill-wide-sgpr.ll for new tests in master.

arsenm accepted this revision.Jun 2 2020, 12:43 PM

LGTM with some comment fixes

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
872

The "In which case use VGPR." reads weirdly to me

885

Capitalize

958

Capitalize

This revision is now accepted and ready to land.Jun 2 2020, 12:43 PM
This revision was automatically updated to reflect the committed changes.