Diff 70725

lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 337 Lines • ▼ Show 20 Lines	bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;		return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
}		}

void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,		void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,		MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,		const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {		unsigned SrcReg, bool KillSrc) const {

// If we are trying to copy to or from SCC, there is a bug somewhere else in
// the backend. While it may be theoretically possible to do this, it should
// never be necessary.
assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

static const int16_t Sub0_15[] = {		static const int16_t Sub0_15[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,		AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,		AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,		AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,		AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
};		};

static const int16_t Sub0_15_64[] = {		static const int16_t Sub0_15_64[] = {
Show All 28 Lines	void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
static const int16_t Sub0_1[] = {		static const int16_t Sub0_1[] = {
AMDGPU::sub0, AMDGPU::sub1,		AMDGPU::sub0, AMDGPU::sub1,
};		};

unsigned Opcode;		unsigned Opcode;
ArrayRef<int16_t> SubIndices;		ArrayRef<int16_t> SubIndices;

if (AMDGPU::SReg_32RegClass.contains(DestReg)) {		if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
		if (SrcReg == AMDGPU::SCC) {
		BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
		.addImm(-1)
		.addImm(0);
		return;
		}

assert(AMDGPU::SReg_32RegClass.contains(SrcReg));		assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)		BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));		.addReg(SrcReg, getKillRegState(KillSrc));
return;		return;

} else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {		} else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
if (DestReg == AMDGPU::VCC) {		if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {		if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
Show All 10 Lines	if (DestReg == AMDGPU::VCC) {
return;		return;
}		}

assert(AMDGPU::SReg_64RegClass.contains(SrcReg));		assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)		BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));		.addReg(SrcReg, getKillRegState(KillSrc));
return;		return;

		} else if (DestReg == AMDGPU::SCC) {
		assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
		BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
		.addReg(SrcReg, getKillRegState(KillSrc))
		.addImm(0);
		return;
} else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {		} else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_128RegClass.contains(SrcReg));		assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
Opcode = AMDGPU::S_MOV_B64;		Opcode = AMDGPU::S_MOV_B64;
SubIndices = Sub0_3_64;		SubIndices = Sub0_3_64;

} else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {		} else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_256RegClass.contains(SrcReg));		assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
Opcode = AMDGPU::S_MOV_B64;		Opcode = AMDGPU::S_MOV_B64;
▲ Show 20 Lines • Show All 2,815 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIWholeQuadMode.cpp

Show First 20 Lines • Show All 123 Lines • ▼ Show 20 Lines	private:
void markInstruction(MachineInstr &MI, char Flag,		void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);		std::vector<WorkItem> &Worklist);
void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);		void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);		char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);		void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);		void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
char analyzeFunction(MachineFunction &MF);		char analyzeFunction(MachineFunction &MF);

		bool requiresCorrectState(const MachineInstr &MI) const;

		MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
		MachineBasicBlock::iterator Before);
		MachineBasicBlock::iterator
		prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
		MachineBasicBlock::iterator Last, bool PreferLast,
		bool SaveSCC);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,		void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg);		unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,		void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SavedWQM);		unsigned SavedWQM);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);		void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

void lowerLiveMaskQueries(unsigned LiveMaskReg);		void lowerLiveMaskQueries(unsigned LiveMaskReg);

▲ Show 20 Lines • Show All 253 Lines • ▼ Show 20 Lines	if (WI.MI)
propagateInstruction(*WI.MI, Worklist);		propagateInstruction(*WI.MI, Worklist);
else		else
propagateBlock(*WI.MBB, Worklist);		propagateBlock(*WI.MBB, Worklist);
}		}

return GlobalFlags;		return GlobalFlags;
}		}

		/// Whether \p MI really requires the exec state computed during analysis.
		///
		/// Scalar instructions must occasionally be marked WQM for correct propagation
		/// (e.g. thread masks leading up to branches), but when it comes to actual
		/// execution, they don't care about EXEC.
		bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
		if (MI.isTerminator())
		return true;

		// Skip instructions that are not affected by EXEC
		if (TII->isScalarUnit(MI))
		return false;

		// Generic instructions such as COPY will either disappear by register
		// coalescing or be lowered to SALU or VALU instructions.
		if (MI.isTransient()) {
		arsenm (unsubmitted, not done): MI.isTransient() might be more accurate. INLINEASM, for example, will not be removed.
		if (MI.getNumExplicitOperands() >= 1) {
		const MachineOperand &Op = MI.getOperand(0);
		if (Op.isReg()) {
		if (TRI->isSGPRReg(*MRI, Op.getReg())) {
		// SGPR instructions are not affected by EXEC
		return false;
		}
		}
		}
		}

		return true;
		}

		MachineBasicBlock::iterator
		SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
		MachineBasicBlock::iterator Before) {
		unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

		MachineInstr *Save =
		BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
		.addReg(AMDGPU::SCC);
		MachineInstr *Restore =
		BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
		.addReg(SaveReg);
		arsenm (unsubmitted, done): I think this should use -1 instead of 1, and use compare != 0 to be more canonical.
		arsenm (unsubmitted, done): Maybe you should just emit COPY and move this expansion to copyPhysReg for SCC.

		LIS->InsertMachineInstrInMaps(*Save);
		LIS->InsertMachineInstrInMaps(*Restore);
		LIS->createAndComputeVirtRegInterval(SaveReg);

		return Restore;
		}

		// Return an iterator in the (inclusive) range [First, Last] at which
		// instructions can be safely inserted, keeping in mind that some of the
		// instructions we want to add necessarily clobber SCC.
		//
		// If \p SaveSCC is false, SCC liveness is not consulted and either \p Last
		// (when \p PreferLast is set) or \p First is returned unchanged.
		// Otherwise the search starts at the preferred end of the range and walks
		// toward the other end, one SCC live segment at a time, looking for a slot
		// index that lies outside SCC's live range. If the walk would leave the
		// range before such a slot is found, the position reached so far is passed
		// to saveSCC() and the iterator returned by saveSCC() is handed back.
		MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
		MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
		MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
		// Nothing to protect: pick the preferred end of the range directly.
		if (!SaveSCC)
		return PreferLast ? Last : First;

		// Live range of the register unit obtained for SCC.
		LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
		auto MBBE = MBB.end();
		// MBB.end() is not an instruction, so it is represented by the block's end
		// index instead of an instruction index.
		SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
		: LIS->getMBBEndIdx(&MBB);
		SlotIndex LastIdx =
		Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
		// Start the search at the preferred end of the range.
		SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
		// Segment of LR containing Idx; assigned on every loop iteration and
		// inspected after the loop (null means Idx is outside LR).
		const LiveRange::Segment *S;

		for (;;) {
		S = LR.getSegmentContaining(Idx);
		// Idx is outside every segment of LR: a usable slot was found.
		if (!S)
		break;

		if (PreferLast) {
		// Move backwards to the base index of the segment's start, unless that
		// would fall before the allowed range.
		SlotIndex Next = S->start.getBaseIndex();
		if (Next < FirstIdx)
		break;
		Idx = Next;
		} else {
		// Move forwards to the base index following the segment's end, unless
		// that would fall past the allowed range.
		SlotIndex Next = S->end.getNextIndex().getBaseIndex();
		if (Next > LastIdx)
		break;
		Idx = Next;
		}
		}

		MachineBasicBlock::iterator MBBI;

		// Translate the chosen slot index back into an iterator. An index with no
		// instruction attached is expected to be the end of the block.
		if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
		MBBI = MI;
		else {
		assert(Idx == LIS->getMBBEndIdx(&MBB));
		MBBI = MBB.end();
		}

		// S is still set when the loop stopped with Idx inside a segment of LR;
		// in that case delegate to saveSCC() and use the iterator it returns.
		if (S)
		MBBI = saveSCC(MBB, MBBI);

		return MBBI;
		}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,		void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,		MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg) {		unsigned SaveWQM, unsigned LiveMaskReg) {
		MachineInstr *MI;

if (SaveWQM) {		if (SaveWQM) {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),		MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
		arsenm (unsubmitted, not done): Should set DebugLoc.
SaveWQM)		SaveWQM)
.addReg(LiveMaskReg);		.addReg(LiveMaskReg);
} else {		} else {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),		MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
AMDGPU::EXEC)		AMDGPU::EXEC)
		arsenm (unsubmitted, not done): These can be combined with findRegisterUseOperandIdx.
.addReg(AMDGPU::EXEC)		.addReg(AMDGPU::EXEC)
.addReg(LiveMaskReg);		.addReg(LiveMaskReg);
}		}

		LIS->InsertMachineInstrInMaps(*MI);
}		}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,		void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,		MachineBasicBlock::iterator Before,
unsigned SavedWQM) {		unsigned SavedWQM) {
		MachineInstr *MI;

if (SavedWQM) {		if (SavedWQM) {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)		MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
		arsenm (unsubmitted, not done): Should set DebugLoc.
.addReg(SavedWQM);		.addReg(SavedWQM);
} else {		} else {
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),		MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)		AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);		.addReg(AMDGPU::EXEC);
}		}

		LIS->InsertMachineInstrInMaps(*MI);
}		}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,		void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool isEntry) {		bool isEntry) {
auto BII = Blocks.find(&MBB);		auto BII = Blocks.find(&MBB);
if (BII == Blocks.end())		if (BII == Blocks.end())
return;		return;

const BlockInfo &BI = BII->second;		const BlockInfo &BI = BII->second;

if (!(BI.InNeeds & StateWQM))		if (!(BI.InNeeds & StateWQM))
return;		return;

// This is a non-entry block that is WQM throughout, so no need to do		// This is a non-entry block that is WQM throughout, so no need to do
// anything.		// anything.
if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)		if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
return;		return;

		arsenm (unsubmitted, not done): This function is rather complicated just to special-case the tracking of SCC liveness. Can you use LivePhysRegs or check the LiveIntervals to simplify it? (I think there was some issue with tracking physical registers with LIS the last time I tried to do it, but I don't remember.)
DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");		DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");

unsigned SavedWQMReg = 0;		unsigned SavedWQMReg = 0;
bool WQMFromExec = isEntry;		bool WQMFromExec = isEntry;
char State = isEntry ? StateExact : StateWQM;		char State = isEntry ? StateExact : StateWQM;
MachineInstr *FirstNonWQM = nullptr;

auto II = MBB.getFirstNonPHI(), IE = MBB.end();		auto II = MBB.getFirstNonPHI(), IE = MBB.end();
while (II != IE) {		if (isEntry)
MachineInstr &MI = *II;		++II; // Skip the instruction that saves LiveMask
		arsenm (unsubmitted, done): Capitalize.
++II;

// Skip instructions that are not affected by EXEC		MachineBasicBlock::iterator First = IE;
if (TII->isScalarUnit(MI) && !MI.isTerminator())		for (;;) {
continue;		MachineBasicBlock::iterator Next = II;
		char Needs = 0;
		char OutNeeds = 0;

// Generic instructions such as COPY will either disappear by register		if (First == IE)
// coalescing or be lowered to SALU or VALU instructions.		First = II;
		arsenm (unsubmitted, not done): The Clean = true can be hoisted to the initializer.
if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
if (MI.getNumExplicitOperands() >= 1) {		if (II != IE) {
const MachineOperand &Op = MI.getOperand(0);		MachineInstr &MI = *II;
if (Op.isReg()) {
		arsenm (unsubmitted, not done): Use a range loop.
if (TRI->isSGPRReg(*MRI, Op.getReg())) {		if (requiresCorrectState(MI)) {
		arsenm (unsubmitted, not done): Maybe this should get the read operand and see if it is undef?
// SGPR instructions are not affected by EXEC		auto III = Instructions.find(&MI);
continue;		if (III != Instructions.end()) {
}		Needs = III->second.Needs;
		OutNeeds = III->second.OutNeeds;
}		}
		arsenm (unsubmitted, not done): Should check if it's dead?
}		}

		if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
		Needs = StateExact;

		if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
		MI.getOperand(3).setImm(1);

		++Next;
		} else {
		// End of basic block
		if (BI.OutNeeds & StateWQM)
		Needs = StateWQM;
		else if (BI.OutNeeds == StateExact)
		Needs = StateExact;
}		}

char Needs = 0;		if (Needs) {
		arsenm (unsubmitted, not done): I think these should use MI's DebugLoc.
char OutNeeds = 0;		if (Needs != State) {
auto InstrInfoIt = Instructions.find(&MI);		MachineBasicBlock::iterator Before =
if (InstrInfoIt != Instructions.end()) {		prepareInsertion(MBB, First, II, Needs == StateWQM,
Needs = InstrInfoIt->second.Needs;		Needs == StateExact \|\| WQMFromExec);
OutNeeds = InstrInfoIt->second.OutNeeds;
}

// Keep track of the first consecutive non-WQM instruction, so that we
// switch away from WQM as soon as possible, potentially saving a small
// bit of bandwidth on loads.
if (Needs == StateWQM)
FirstNonWQM = nullptr;
else if (!FirstNonWQM)
FirstNonWQM = &MI;

// State switching
if (Needs && State != Needs) {
if (Needs == StateExact) {		if (Needs == StateExact) {
assert(!SavedWQMReg);

if (!WQMFromExec && (OutNeeds & StateWQM))		if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);		SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

		arsenm (unsubmitted, done): Insert through LIS.
toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);		toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
} else {		} else {
assert(WQMFromExec == (SavedWQMReg == 0));		assert(WQMFromExec == (SavedWQMReg == 0));
		arsenm (unsubmitted, done): This can just return Restore.
toWQM(MBB, &MI, SavedWQMReg);
		toWQM(MBB, Before, SavedWQMReg);

		if (SavedWQMReg) {
		LIS->createAndComputeVirtRegInterval(SavedWQMReg);
SavedWQMReg = 0;		SavedWQMReg = 0;
}		}
		}

State = Needs;		State = Needs;
}		}

if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)		First = IE;
MI.getOperand(3).setImm(1);
}		}

if ((BI.OutNeeds & StateWQM) && State != StateWQM) {		if (II == IE)
assert(WQMFromExec == (SavedWQMReg == 0));		break;
toWQM(MBB, MBB.end(), SavedWQMReg);		II = Next;
} else if (BI.OutNeeds == StateExact && State != StateExact) {
toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
: MBB.getFirstTerminator(),
0, LiveMaskReg);
}		}
}		}

void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {		void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
for (MachineInstr *MI : LiveMaskQueries) {		for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();		const DebugLoc &DL = MI->getDebugLoc();
unsigned Dest = MI->getOperand(0).getReg();		unsigned Dest = MI->getOperand(0).getReg();
		MachineInstr *Copy =
BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)		BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
.addReg(LiveMaskReg);		.addReg(LiveMaskReg);

		LIS->ReplaceMachineInstrInMaps(MI, Copy);
MI->eraseFromParent();		MI->eraseFromParent();
}		}
}		}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {		bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)		if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
return false;		return false;

Show All 17 Lines	bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// Store a copy of the original live mask when required		// Store a copy of the original live mask when required
unsigned LiveMaskReg = 0;		unsigned LiveMaskReg = 0;
{		{
MachineBasicBlock &Entry = MF.front();		MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();		MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

if (GlobalFlags & StateExact \|\| !LiveMaskQueries.empty()) {		if (GlobalFlags & StateExact \|\| !LiveMaskQueries.empty()) {
LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);		LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)		MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
		TII->get(AMDGPU::COPY), LiveMaskReg)
.addReg(AMDGPU::EXEC);		.addReg(AMDGPU::EXEC);
		LIS->InsertMachineInstrInMaps(*MI);
}		}

if (GlobalFlags == StateWQM) {		if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.		// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),		BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)		AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);		.addReg(AMDGPU::EXEC);

lowerLiveMaskQueries(LiveMaskReg);		lowerLiveMaskQueries(LiveMaskReg);
// EntryMI may become invalid here		// EntryMI may become invalid here
return true;		return true;
}		}
}		}

DEBUG(printInfo());		DEBUG(printInfo());

lowerLiveMaskQueries(LiveMaskReg);		lowerLiveMaskQueries(LiveMaskReg);

// Handle the general case		// Handle the general case
for (auto BII : Blocks)		for (auto BII : Blocks)
processBlock(BII.first, LiveMaskReg, BII.first == &MF.begin());		processBlock(BII.first, LiveMaskReg, BII.first == &MF.begin());

		// Physical registers like SCC aren't tracked by default anyway, so just
		// removing the ranges we computed is the simplest option for maintaining
		// the analysis results.
		LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

return true;		return true;
}		}

test/CodeGen/AMDGPU/wqm.ll

	Show First 20 Lines • Show All 460 Lines • ▼ Show 20 Lines
	if:			if:
	store volatile <4 x float> %dtex, <4 x float>* undef			store volatile <4 x float> %dtex, <4 x float>* undef
	unreachable			unreachable

	else:			else:
	ret <4 x float> %dtex			ret <4 x float> %dtex
	}			}

				; Test awareness that s_wqm_b64 clobbers SCC.
				arsenm (unsubmitted, not done): I don't see check lines for the cmp + select restore pattern here. Where is the SCC def? This test also probably needs a comment.
				nhaehnle (author; unsubmitted, not done): That's because I didn't actually manage to concoct a test where the pattern is unavoidable. Usually, the SCC def ends up right before the SCC use, so that the WQM instruction can just be moved around it. I've tested the pattern with some artificial hacks in the code (in the latest version by setting First == Last in prepareInsertion). The problem is that I cannot prove that it will never be needed. For example, it might be needed if the machine scheduler makes unusual decisions about moving store instructions between the SCC def and the SCC use.
				;
				; CHECK-LABEL: {{^}}test_scc:
				; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
				; CHECK: s_wqm_b64 exec, exec
				; CHECK: s_cmp_
				; CHECK-NEXT: s_cbranch_scc
				arsenm (unsubmitted, done): CHECK-NEXT for s_cbranch_scc?
				; CHECK: ; %if
				; CHECK: s_and_b64 exec, exec, [[ORIG]]
				; CHECK: image_sample
				; CHECK: ; %else
				; CHECK: s_and_b64 exec, exec, [[ORIG]]
				; CHECK: image_sample
				; CHECK: ; %end
				define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
				main_body:
				%cc = icmp sgt i32 %sel, 0
				br i1 %cc, label %if, label %else

				if:
				%r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
				br label %end

				else:
				%r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 1>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
				br label %end

				end:
				%r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]

				call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)

				ret <4 x float> %r
				}


	declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1			declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
	declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1			declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
	declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1			declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1

	declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2			declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
	declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2			declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2

	declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3			declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
				declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
	declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3			declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3

	declare void @llvm.AMDGPU.kill(float)			declare void @llvm.AMDGPU.kill(float)
	declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)			declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

	attributes #1 = { nounwind }			attributes #1 = { nounwind }
	attributes #2 = { nounwind readonly }			attributes #2 = { nounwind readonly }
	attributes #3 = { nounwind readnone }			attributes #3 = { nounwind readnone }
	attributes #4 = { "amdgpu-ps-wqm-outputs" }			attributes #4 = { "amdgpu-ps-wqm-outputs" }

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Do not clobber SCC in SIWholeQuadMode
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 70725

lib/Target/AMDGPU/SIInstrInfo.cpp

lib/Target/AMDGPU/SIWholeQuadMode.cpp

test/CodeGen/AMDGPU/wqm.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Do not clobber SCC in SIWholeQuadMode
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 70725

lib/Target/AMDGPU/SIInstrInfo.cpp

lib/Target/AMDGPU/SIWholeQuadMode.cpp

test/CodeGen/AMDGPU/wqm.ll

AMDGPU: Do not clobber SCC in SIWholeQuadMode
ClosedPublic