Diff 70267

llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Show First 20 Lines • Show All 63 Lines • ▼ Show 20 Lines

namespace {		namespace {

// Execution-state bit flags used by the whole-quad-mode analysis. The values		// Execution-state bit flags used by the whole-quad-mode analysis. The values
// are distinct bits so that both can be held in one char mask at once.		// are distinct bits so that both can be held in one char mask at once.
enum {		enum {
StateWQM = 0x1,		StateWQM = 0x1,
StateExact = 0x2,		StateExact = 0x2,
};		};

		/// Wraps a state bitmask in its own type so that it can be streamed with a
		/// dedicated operator<< overload instead of being printed as a raw integer.
		struct PrintState {
		  int State;

		  explicit PrintState(int Bits) : State(Bits) {}
		};

		/// Streams the symbolic name of a state bitmask: "WQM", "Exact", or
		/// "WQM|Exact" when both bits are set. Nothing is written for an empty mask.
		static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
		  if (PS.State & StateWQM)
		    OS << "WQM";
		  if (PS.State & StateExact) {
		    // Emit the separator only when the WQM name was printed as well.
		    if (PS.State & StateWQM)
		      OS << '|';
		    OS << "Exact";
		  }

		  return OS;
		}

// Per-instruction analysis record; both fields hold State* bit flags and		// Per-instruction analysis record; both fields hold State* bit flags and
// start out empty (0).		// start out empty (0).
struct InstrInfo {		struct InstrInfo {
// State this instruction itself requires (set by markInstruction).		// State this instruction itself requires (set by markInstruction).
char Needs = 0;		char Needs = 0;
// States required after this instruction; accumulated during propagation.		// States required after this instruction; accumulated during propagation.
char OutNeeds = 0;		char OutNeeds = 0;
};		};

struct BlockInfo {		struct BlockInfo {
char Needs = 0;		char Needs = 0;
char InNeeds = 0;		char InNeeds = 0;
Show All 13 Lines
private:		private:
const SIInstrInfo *TII;		const SIInstrInfo *TII;
const SIRegisterInfo *TRI;		const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;		MachineRegisterInfo *MRI;
LiveIntervals *LIS;		LiveIntervals *LIS;

DenseMap<const MachineInstr *, InstrInfo> Instructions;		DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;		DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<const MachineInstr *, 2> ExecExports;
SmallVector<MachineInstr *, 1> LiveMaskQueries;		SmallVector<MachineInstr *, 1> LiveMaskQueries;

		void printInfo();

void markInstruction(MachineInstr &MI, char Flag,		void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);		std::vector<WorkItem> &Worklist);
		void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);		char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);		void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);		void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
char analyzeFunction(MachineFunction &MF);		char analyzeFunction(MachineFunction &MF);

void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,		void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg);		unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,		void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
Show All 32 Lines	INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)		false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;		char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

/// Factory for the pass: returns a newly allocated SIWholeQuadMode instance.		/// Factory for the pass: returns a newly allocated SIWholeQuadMode instance.
/// NOTE(review): ownership presumably transfers to the caller - confirm.		/// NOTE(review): ownership presumably transfers to the caller - confirm.
FunctionPass *llvm::createSIWholeQuadModePass() {		FunctionPass *llvm::createSIWholeQuadModePass() {
return new SIWholeQuadMode;		return new SIWholeQuadMode;
}		}

		void SIWholeQuadMode::printInfo() {
		for (const auto &BII : Blocks) {
		dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
		<< " InNeeds = " << PrintState(BII.second.InNeeds)
		<< ", Needs = " << PrintState(BII.second.Needs)
		<< ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

		for (const MachineInstr &MI : *BII.first) {
		auto III = Instructions.find(&MI);
		if (III == Instructions.end())
		continue;

		dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
		<< ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
		}
		}
		}

/// Record that \p MI requires execution state \p Flag and queue it on		/// Record that \p MI requires execution state \p Flag and queue it on
/// \p Worklist. \p Flag must be exactly StateWQM or StateExact. The first		/// \p Worklist. \p Flag must be exactly StateWQM or StateExact. The first
/// mark wins: an instruction whose Needs is already non-zero is left as is		/// mark wins: an instruction whose Needs is already non-zero is left as is
/// and is not queued again.		/// and is not queued again.
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,		void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {		std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];		InstrInfo &II = Instructions[&MI];

assert(Flag == StateWQM \|\| Flag == StateExact);		assert(Flag == StateWQM \|\| Flag == StateExact);

// Ignore if the instruction is already marked. The typical case is that we		// Ignore if the instruction is already marked. The typical case is that we
// mark an instruction WQM multiple times, but for atomics it can happen that		// mark an instruction WQM multiple times, but for atomics it can happen that
// Flag is StateWQM, but Needs is already set to StateExact. In this case,		// Flag is StateWQM, but Needs is already set to StateExact. In this case,
// letting the atomic run in StateExact is correct as per the relevant specs.		// letting the atomic run in StateExact is correct as per the relevant specs.
if (II.Needs)		if (II.Needs)
return;		return;

II.Needs = Flag;		II.Needs = Flag;
Worklist.push_back(&MI);		Worklist.push_back(&MI);
}		}

		/// Mark all instructions defining the uses in \p MI as WQM.
		///
		/// Every register use operand of \p MI is examined. For a virtual register,
		/// each defining instruction is marked StateWQM. For a physical register
		/// other than EXEC, the value reaching \p MI is looked up per register unit
		/// through LiveIntervals and its defining instruction is marked; values
		/// without a reaching definition or defined by a PHI-def are skipped.
		/// Newly marked instructions are appended to \p Worklist by markInstruction.
		void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
		                                  std::vector<WorkItem> &Worklist) {
		  for (const MachineOperand &Use : MI.uses()) {
		    if (!Use.isReg() || !Use.isUse())
		      continue;

		    unsigned Reg = Use.getReg();

		    // Handle physical registers that we need to track; this is mostly relevant
		    // for VCC, which can appear as the (implicit) input of a uniform branch,
		    // e.g. when a loop counter is stored in a VGPR.
		    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
		      if (Reg == AMDGPU::EXEC)
		        continue;

		      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
		        LiveRange &LR = LIS->getRegUnit(*RegUnit);
		        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
		        if (!Value)
		          continue;

		        // Since we're in machine SSA, we do not need to track physical
		        // registers across basic blocks.
		        if (Value->isPHIDef())
		          continue;

		        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
		                        Worklist);
		      }

		      continue;
		    }

		    // Virtual register: every defining instruction feeds this use.
		    for (MachineInstr &DefMI : MRI->def_instructions(Reg))
		      markInstruction(DefMI, StateWQM, Worklist);
		  }
		}

// Scan instructions to determine which ones require an Exact execmask and		// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.		// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,		char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {		std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;		char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");		bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");

for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {		for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;		MachineBasicBlock &MBB = *BI;

for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {		for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
MachineInstr &MI = *II;		MachineInstr &MI = *II;
unsigned Opcode = MI.getOpcode();		unsigned Opcode = MI.getOpcode();
char Flags = 0;		char Flags = 0;

if (TII->isWQM(Opcode) \|\| TII->isDS(Opcode)) {		if (TII->isDS(Opcode)) {
Flags = StateWQM;		Flags = StateWQM;
		} else if (TII->isWQM(Opcode)) {
		// Sampling instructions don't need to produce results for all pixels
		// in a quad, they just require all inputs of a quad to have been
		// computed for derivatives.
		markUsesWQM(MI, Worklist);
		GlobalFlags \|= StateWQM;
		continue;
} else if (TII->isDisableWQM(MI)) {		} else if (TII->isDisableWQM(MI)) {
Flags = StateExact;		Flags = StateExact;
} else {		} else {
// Handle export instructions with the exec mask valid flag set		if (Opcode == AMDGPU::SI_PS_LIVE) {
if (Opcode == AMDGPU::EXP) {
if (MI.getOperand(4).getImm() != 0)
ExecExports.push_back(&MI);
} else if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);		LiveMaskQueries.push_back(&MI);
} else if (WQMOutputs) {		} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical		// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are		// VGPRs correspond to shader inputs and outputs. Inputs are
// only used, outputs are only defined.		// only used, outputs are only defined.
for (const MachineOperand &MO : MI.defs()) {		for (const MachineOperand &MO : MI.defs()) {
if (!MO.isReg())		if (!MO.isReg())
continue;		continue;
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	if (!PrevMI->isPHI()) {
PrevII.OutNeeds \|= InNeeds;		PrevII.OutNeeds \|= InNeeds;
Worklist.push_back(PrevMI);		Worklist.push_back(PrevMI);
}		}
}		}
}		}

// Propagate WQM flag to instruction inputs		// Propagate WQM flag to instruction inputs
assert(II.Needs != (StateWQM \| StateExact));		assert(II.Needs != (StateWQM \| StateExact));
if (II.Needs != StateWQM)
return;

for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() \|\| !Use.isUse())
continue;

unsigned Reg = Use.getReg();

// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
if (Reg == AMDGPU::EXEC)
continue;

for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
LiveRange &LR = LIS->getRegUnit(*RegUnit);
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
if (!Value)
continue;

// Since we're in machine SSA, we do not need to track physical
// registers across basic blocks.
if (Value->isPHIDef())
continue;

markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,		if (II.Needs == StateWQM)
Worklist);		markUsesWQM(MI, Worklist);
}

continue;
}

for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
markInstruction(DefMI, StateWQM, Worklist);
}
}		}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,		void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
std::vector<WorkItem>& Worklist) {		std::vector<WorkItem>& Worklist) {
BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.		BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

// Propagate through instructions		// Propagate through instructions
if (!MBB.empty()) {		if (!MBB.empty()) {
▲ Show 20 Lines • Show All 83 Lines • ▼ Show 20 Lines	void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (!(BI.InNeeds & StateWQM))		if (!(BI.InNeeds & StateWQM))
return;		return;

// This is a non-entry block that is WQM throughout, so no need to do		// This is a non-entry block that is WQM throughout, so no need to do
// anything.		// anything.
if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)		if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
return;		return;

		DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");

unsigned SavedWQMReg = 0;		unsigned SavedWQMReg = 0;
bool WQMFromExec = isEntry;		bool WQMFromExec = isEntry;
char State = isEntry ? StateExact : StateWQM;		char State = isEntry ? StateExact : StateWQM;
		MachineInstr *FirstNonWQM = nullptr;

auto II = MBB.getFirstNonPHI(), IE = MBB.end();		auto II = MBB.getFirstNonPHI(), IE = MBB.end();
while (II != IE) {		while (II != IE) {
MachineInstr &MI = *II;		MachineInstr &MI = *II;
++II;		++II;

// Skip instructions that are not affected by EXEC		// Skip instructions that are not affected by EXEC
if (TII->isScalarUnit(MI) && !MI.isTerminator())		if (TII->isScalarUnit(MI) && !MI.isTerminator())
Show All 14 Lines	while (II != IE) {
}		}

char Needs = 0;		char Needs = 0;
char OutNeeds = 0;		char OutNeeds = 0;
auto InstrInfoIt = Instructions.find(&MI);		auto InstrInfoIt = Instructions.find(&MI);
if (InstrInfoIt != Instructions.end()) {		if (InstrInfoIt != Instructions.end()) {
Needs = InstrInfoIt->second.Needs;		Needs = InstrInfoIt->second.Needs;
OutNeeds = InstrInfoIt->second.OutNeeds;		OutNeeds = InstrInfoIt->second.OutNeeds;

// Make sure to switch to Exact mode before the end of the block when
// Exact and only Exact is needed further downstream.
if (OutNeeds == StateExact && MI.isTerminator()) {
assert(Needs == 0);
Needs = StateExact;
}
}		}

		// Keep track of the first consecutive non-WQM instruction, so that we
		// switch away from WQM as soon as possible, potentially saving a small
		// bit of bandwidth on loads.
		if (Needs == StateWQM)
		FirstNonWQM = nullptr;
		else if (!FirstNonWQM)
		FirstNonWQM = &MI;

// State switching		// State switching
if (Needs && State != Needs) {		if (Needs && State != Needs) {
if (Needs == StateExact) {		if (Needs == StateExact) {
assert(!SavedWQMReg);		assert(!SavedWQMReg);

if (!WQMFromExec && (OutNeeds & StateWQM))		if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);		SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);		toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);
} else {		} else {
assert(WQMFromExec == (SavedWQMReg == 0));		assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, &MI, SavedWQMReg);		toWQM(MBB, &MI, SavedWQMReg);
SavedWQMReg = 0;		SavedWQMReg = 0;
}		}

State = Needs;		State = Needs;
}		}

if (MI.getOpcode() == AMDGPU::SI_ELSE && State == StateExact)		if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
MI.getOperand(3).setImm(1);		MI.getOperand(3).setImm(1);
}		}

if ((BI.OutNeeds & StateWQM) && State != StateWQM) {		if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
assert(WQMFromExec == (SavedWQMReg == 0));		assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, MBB.end(), SavedWQMReg);		toWQM(MBB, MBB.end(), SavedWQMReg);
} else if (BI.OutNeeds == StateExact && State != StateExact) {		} else if (BI.OutNeeds == StateExact && State != StateExact) {
toExact(MBB, MBB.end(), 0, LiveMaskReg);		toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
		: MBB.getFirstTerminator(),
		0, LiveMaskReg);
}		}
}		}

/// Replace every instruction recorded in LiveMaskQueries with a COPY of		/// Replace every instruction recorded in LiveMaskQueries with a COPY of
/// \p LiveMaskReg into that instruction's operand-0 register. The COPY is		/// \p LiveMaskReg into that instruction's operand-0 register. The COPY is
/// built at the query's position in its parent block, after which the query		/// built at the query's position in its parent block, after which the query
/// instruction itself is erased.		/// instruction itself is erased.
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {		void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
for (MachineInstr *MI : LiveMaskQueries) {		for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();		const DebugLoc &DL = MI->getDebugLoc();
unsigned Dest = MI->getOperand(0).getReg();		unsigned Dest = MI->getOperand(0).getReg();
BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)		BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
.addReg(LiveMaskReg);		.addReg(LiveMaskReg);
MI->eraseFromParent();		MI->eraseFromParent();
}		}
}		}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {		bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)		if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
return false;		return false;

Instructions.clear();		Instructions.clear();
Blocks.clear();		Blocks.clear();
ExecExports.clear();
LiveMaskQueries.clear();		LiveMaskQueries.clear();

const SISubtarget &ST = MF.getSubtarget<SISubtarget>();		const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

TII = ST.getInstrInfo();		TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();		TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();		MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();		LIS = &getAnalysis<LiveIntervals>();
Show All 23 Lines	if (GlobalFlags == StateWQM) {
.addReg(AMDGPU::EXEC);		.addReg(AMDGPU::EXEC);

lowerLiveMaskQueries(LiveMaskReg);		lowerLiveMaskQueries(LiveMaskReg);
// EntryMI may become invalid here		// EntryMI may become invalid here
return true;		return true;
}		}
}		}

		DEBUG(printInfo());

lowerLiveMaskQueries(LiveMaskReg);		lowerLiveMaskQueries(LiveMaskReg);

// Handle the general case		// Handle the general case
for (auto BII : Blocks)		for (auto BII : Blocks)
processBlock(BII.first, LiveMaskReg, BII.first == &MF.begin());		processBlock(BII.first, LiveMaskReg, BII.first == &MF.begin());

return true;		return true;
}		}

llvm/trunk/test/CodeGen/AMDGPU/wqm.ll

	Show All 31 Lines
	}			}

	; ... but disabled for stores (and, in this simple case, not re-enabled).			; ... but disabled for stores (and, in this simple case, not re-enabled).
	;			;
	;CHECK-LABEL: {{^}}test3:			;CHECK-LABEL: {{^}}test3:
	;CHECK-NEXT: ; %main_body			;CHECK-NEXT: ; %main_body
	;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK-NEXT: s_wqm_b64 exec, exec			;CHECK-NEXT: s_wqm_b64 exec, exec
	;CHECK: image_sample
	;CHECK: s_and_b64 exec, exec, [[ORIG]]			;CHECK: s_and_b64 exec, exec, [[ORIG]]
				;CHECK: image_sample
	;CHECK: store			;CHECK: store
	;CHECK-NOT: exec			;CHECK-NOT: exec
	;CHECK: .size test3			;CHECK: .size test3
	define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) {			define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) {
	main_body:			main_body:
	%tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)			%tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
	%tex.1 = bitcast <4 x float> %tex to <4 x i32>			%tex.1 = bitcast <4 x float> %tex to <4 x i32>
	%tex.2 = extractelement <4 x i32> %tex.1, i32 0			%tex.2 = extractelement <4 x i32> %tex.1, i32 0

	call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0)			call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0)

	ret <4 x float> %tex			ret <4 x float> %tex
	}			}

	; Check that WQM is re-enabled when required.			; Check that WQM is re-enabled when required.
	;			;
	;CHECK-LABEL: {{^}}test4:			;CHECK-LABEL: {{^}}test4:
	;CHECK-NEXT: ; %main_body			;CHECK-NEXT: ; %main_body
	;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK-NEXT: s_wqm_b64 exec, exec			;CHECK-NEXT: s_wqm_b64 exec, exec
	;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1			;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
	;CHECK: s_and_b64 exec, exec, [[ORIG]]			;CHECK: s_and_b64 exec, exec, [[ORIG]]
	;CHECK: store			;CHECK: store
	;CHECK: s_wqm_b64 exec, exec			;CHECK: s_wqm_b64 exec, exec
	;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf			;CHECK: image_sample
				;CHECK: image_sample
	define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {			define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
	main_body:			main_body:
	%c.1 = mul i32 %c, %d			%c.1 = mul i32 %c, %d

	call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)			call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)

	%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)			%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
	ret <4 x float> %tex			%tex.1 = bitcast <4 x float> %tex to <4 x i32>
				%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
				ret <4 x float> %dtex
	}			}

	; Check a case of one branch of an if-else requiring WQM, the other requiring			; Check a case of one branch of an if-else requiring WQM, the other requiring
	; exact.			; exact.
	;			;
	; Note: In this particular case, the save-and-restore could be avoided if the			; Note: In this particular case, the save-and-restore could be avoided if the
	; analysis understood that the two branches of the if-else are mutually			; analysis understood that the two branches of the if-else are mutually
	; exclusive.			; exclusive.
	;			;
	;CHECK-LABEL: {{^}}test_control_flow_0:			;CHECK-LABEL: {{^}}test_control_flow_0:
	;CHECK-NEXT: ; %main_body			;CHECK-NEXT: ; %main_body
	;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK-NEXT: s_wqm_b64 exec, exec			;CHECK-NEXT: s_wqm_b64 exec, exec
	;CHECK: %ELSE			;CHECK: %ELSE
	;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]			;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
	;CHECK: store			;CHECK: store
	;CHECK: s_mov_b64 exec, [[SAVED]]			;CHECK: s_mov_b64 exec, [[SAVED]]
	;CHECK: %IF			;CHECK: %IF
	;CHECK: image_sample			;CHECK: image_sample
				;CHECK: image_sample
	define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {			define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
	main_body:			main_body:
	%cmp = icmp eq i32 %z, 0			%cmp = icmp eq i32 %z, 0
	br i1 %cmp, label %IF, label %ELSE			br i1 %cmp, label %IF, label %ELSE

	IF:			IF:
	%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)			%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
	%data.if = extractelement <4 x float> %tex, i32 0			%tex.1 = bitcast <4 x float> %tex to <4 x i32>
				%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
				%data.if = extractelement <4 x float> %dtex, i32 0
	br label %END			br label %END

	ELSE:			ELSE:
	call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)			call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
	br label %END			br label %END

	END:			END:
	%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]			%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
	ret float %r			ret float %r
	}			}

	; Reverse branch order compared to the previous test.			; Reverse branch order compared to the previous test.
	;			;
	;CHECK-LABEL: {{^}}test_control_flow_1:			;CHECK-LABEL: {{^}}test_control_flow_1:
	;CHECK-NEXT: ; %main_body			;CHECK-NEXT: ; %main_body
	;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK-NEXT: s_wqm_b64 exec, exec			;CHECK-NEXT: s_wqm_b64 exec, exec
	;CHECK: %IF			;CHECK: %IF
	;CHECK: image_sample			;CHECK: image_sample
				;CHECK: image_sample
	;CHECK: %Flow			;CHECK: %Flow
	;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],			;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
	;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]			;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
	;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]			;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
	;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]			;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
	;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]			;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
	;CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %ELSE			;CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %ELSE
	;CHECK: store_dword			;CHECK: store_dword
	;CHECK: [[END_BB]]: ; %END			;CHECK: [[END_BB]]: ; %END
	;CHECK: s_or_b64 exec, exec,			;CHECK: s_or_b64 exec, exec,
	;CHECK: v_mov_b32_e32 v0			;CHECK: v_mov_b32_e32 v0
	;CHECK: ; return			;CHECK: ; return
	define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {			define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
	main_body:			main_body:
	%cmp = icmp eq i32 %z, 0			%cmp = icmp eq i32 %z, 0
	br i1 %cmp, label %ELSE, label %IF			br i1 %cmp, label %ELSE, label %IF

	IF:			IF:
	%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)			%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
	%data.if = extractelement <4 x float> %tex, i32 0			%tex.1 = bitcast <4 x float> %tex to <4 x i32>
				%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
				%data.if = extractelement <4 x float> %dtex, i32 0
	br label %END			br label %END

	ELSE:			ELSE:
	call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)			call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
	br label %END			br label %END

	END:			END:
	%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]			%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
	▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines
	; ... but only if they really do need it.			; ... but only if they really do need it.
	;			;
	;CHECK-LABEL: {{^}}test_control_flow_3:			;CHECK-LABEL: {{^}}test_control_flow_3:
	;CHECK-NEXT: ; %main_body			;CHECK-NEXT: ; %main_body
	;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK-NEXT: s_wqm_b64 exec, exec			;CHECK-NEXT: s_wqm_b64 exec, exec
	;CHECK: image_sample			;CHECK: image_sample
	;CHECK: s_and_b64 exec, exec, [[ORIG]]			;CHECK: s_and_b64 exec, exec, [[ORIG]]
	;CHECK: store			;CHECK: image_sample
	;CHECK: load
	;CHECK: store			;CHECK: store
	;CHECK: v_cmp			;CHECK: v_cmp
	define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {			define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
	main_body:			main_body:
	%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)			%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
	%tex.1 = extractelement <4 x float> %tex, i32 0			%tex.1 = bitcast <4 x float> %tex to <4 x i32>
				%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
	%idx.1 = extractelement <3 x i32> %idx, i32 0			%dtex.1 = extractelement <4 x float> %dtex, i32 0
	%data.1 = extractelement <2 x float> %data, i32 0
	call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)

	%idx.2 = extractelement <3 x i32> %idx, i32 1
	%z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)

	%idx.3 = extractelement <3 x i32> %idx, i32 2			call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
	%data.3 = extractelement <2 x float> %data, i32 1
	call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)

	%cc = fcmp ogt float %z, 0.0			%cc = fcmp ogt float %dtex.1, 0.0
	br i1 %cc, label %IF, label %ELSE			br i1 %cc, label %IF, label %ELSE

	IF:			IF:
	%tex.IF = fmul float %tex.1, 3.0			%tex.IF = fmul float %dtex.1, 3.0
	br label %END			br label %END

	ELSE:			ELSE:
	%tex.ELSE = fmul float %tex.1, 4.0			%tex.ELSE = fmul float %dtex.1, 4.0
	br label %END			br label %END

	END:			END:
	%tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]			%tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
	ret float %tex.END			ret float %tex.END
	}			}

	; Another test that failed at some point because of terminator handling.			; Another test that failed at some point because of terminator handling.
	;			;
	;CHECK-LABEL: {{^}}test_control_flow_4:			;CHECK-LABEL: {{^}}test_control_flow_4:
	;CHECK-NEXT: ; %main_body			;CHECK-NEXT: ; %main_body
	;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK-NEXT: s_wqm_b64 exec, exec			;CHECK-NEXT: s_wqm_b64 exec, exec
	;CHECK: %IF			;CHECK: %IF
	;CHECK: load
	;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]			;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
				;CHECK: load
	;CHECK: store			;CHECK: store
	;CHECK: s_mov_b64 exec, [[SAVE]]			;CHECK: s_mov_b64 exec, [[SAVE]]
	;CHECK: %END			;CHECK: %END
	;CHECK: image_sample			;CHECK: image_sample
				;CHECK: image_sample
	define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {			define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
	main_body:			main_body:
	%cond = icmp eq i32 %y, 0			%cond = icmp eq i32 %y, 0
	br i1 %cond, label %IF, label %END			br i1 %cond, label %IF, label %END

	IF:			IF:
	%data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0)			%data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
	call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)			call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
	br label %END			br label %END

	END:			END:
	%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)			%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
	ret <4 x float> %tex			%tex.1 = bitcast <4 x float> %tex to <4 x i32>
				%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
				ret <4 x float> %dtex
	}			}

	; Kill is performed in WQM mode so that uniform kill behaves correctly ...			; Kill is performed in WQM mode so that uniform kill behaves correctly ...
	;			;
	;CHECK-LABEL: {{^}}test_kill_0:			;CHECK-LABEL: {{^}}test_kill_0:
	;CHECK-NEXT: ; %main_body			;CHECK-NEXT: ; %main_body
	;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK-NEXT: s_wqm_b64 exec, exec			;CHECK-NEXT: s_wqm_b64 exec, exec
	;CHECK: image_sample
	;CHECK: s_and_b64 exec, exec, [[ORIG]]			;CHECK: s_and_b64 exec, exec, [[ORIG]]
				;CHECK: image_sample
	;CHECK: buffer_store_dword			;CHECK: buffer_store_dword
	;CHECK: s_wqm_b64 exec, exec			;CHECK: s_wqm_b64 exec, exec
	;CHECK: v_cmpx_			;CHECK: v_cmpx_
	;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]			;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
	;CHECK: buffer_store_dword			;CHECK: buffer_store_dword
	;CHECK: s_mov_b64 exec, [[SAVE]]			;CHECK: s_mov_b64 exec, [[SAVE]]
	;CHECK: image_sample			;CHECK: image_sample
	define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {			define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
	main_body:			main_body:
	%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)			%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

	%idx.0 = extractelement <2 x i32> %idx, i32 0			%idx.0 = extractelement <2 x i32> %idx, i32 0
	%data.0 = extractelement <2 x float> %data, i32 0			%data.0 = extractelement <2 x float> %data, i32 0
	call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)			call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)

	call void @llvm.AMDGPU.kill(float %z)			call void @llvm.AMDGPU.kill(float %z)

	%idx.1 = extractelement <2 x i32> %idx, i32 1			%idx.1 = extractelement <2 x i32> %idx, i32 1
	%data.1 = extractelement <2 x float> %data, i32 1			%data.1 = extractelement <2 x float> %data, i32 1
	call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)			call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)

	%tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)			%tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
	%out = fadd <4 x float> %tex, %tex2			%tex2.1 = bitcast <4 x float> %tex2 to <4 x i32>
				%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
				%out = fadd <4 x float> %tex, %dtex

	ret <4 x float> %out			ret <4 x float> %out
	}			}

	; ... but only if WQM is necessary.			; ... but only if WQM is necessary.
	;			;
	; CHECK-LABEL: {{^}}test_kill_1:			; CHECK-LABEL: {{^}}test_kill_1:
	; CHECK-NEXT: ; %main_body			; CHECK-NEXT: ; %main_body
	; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	; CHECK: s_wqm_b64 exec, exec			; CHECK: s_wqm_b64 exec, exec
	; CHECK: image_sample			; CHECK: image_sample
	; CHECK: s_and_b64 exec, exec, [[ORIG]]			; CHECK: s_and_b64 exec, exec, [[ORIG]]
				; CHECK: image_sample
	; CHECK: buffer_store_dword			; CHECK: buffer_store_dword
	; CHECK-NOT: wqm			; CHECK-NOT: wqm
	; CHECK: v_cmpx_			; CHECK: v_cmpx_
	define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {			define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
	main_body:			main_body:
	%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)			%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
				%tex.1 = bitcast <4 x float> %tex to <4 x i32>
				%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

	call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)			call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

	call void @llvm.AMDGPU.kill(float %z)			call void @llvm.AMDGPU.kill(float %z)

	ret <4 x float> %tex			ret <4 x float> %dtex
	}			}

	; Check prolog shaders.			; Check prolog shaders.
	;			;
	; CHECK-LABEL: {{^}}test_prolog_1:			; CHECK-LABEL: {{^}}test_prolog_1:
	; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	; CHECK: s_wqm_b64 exec, exec			; CHECK: s_wqm_b64 exec, exec
	; CHECK: v_add_f32_e32 v0,			; CHECK: v_add_f32_e32 v0,
	▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines
	; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0			; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
	; CHECK: s_wqm_b64 exec, exec			; CHECK: s_wqm_b64 exec, exec
	; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen			; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
	; CHECK: s_and_b64 exec, exec, [[LIVE]]			; CHECK: s_and_b64 exec, exec, [[LIVE]]
	; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen			; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
	; CHECK: s_wqm_b64 exec, exec			; CHECK: s_wqm_b64 exec, exec
	; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen			; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen

	; CHECK: image_sample
	; CHECK: s_and_b64 exec, exec, [[LIVE]]			; CHECK: s_and_b64 exec, exec, [[LIVE]]
				; CHECK: image_sample
	; CHECK: buffer_store_dwordx4			; CHECK: buffer_store_dwordx4
	define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {			define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
	entry:			entry:
	%array = alloca [32 x i32], align 4			%array = alloca [32 x i32], align 4

	call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)			call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

	%s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0			%s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0
	▲ Show 20 Lines • Show All 72 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Reduce the duration of whole-quad-mode
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 70267

llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp

llvm/trunk/test/CodeGen/AMDGPU/wqm.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Reduce the duration of whole-quad-mode
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 70267

llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp

llvm/trunk/test/CodeGen/AMDGPU/wqm.ll

AMDGPU: Reduce the duration of whole-quad-mode
ClosedPublic