This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Leave WQM earlier before VMEM loads
AbandonedPublic

Authored by nhaehnle on Jul 10 2016, 7:36 AM.

Download Raw Diff

Details

Reviewers

• tstellarAMD
arsenm
mareko

Summary

When the initial propagation determines that a VMEM load does not have to run
in WQM, make it run in Exact mode instead if the remainder of the shader
doesn't care. This can reduce the bandwidth required for loads.

This typically introduces two additional SALU instructions and uses two
additional SGPRs in pixel shaders for the livemask.

Diff Detail

Event Timeline

nhaehnle updated this revision to Diff 63427. Jul 10 2016, 7:36 AM

nhaehnle retitled this revision to "AMDGPU: Leave WQM earlier before VMEM loads" (it previously had no title).

nhaehnle updated this object.

nhaehnle added reviewers: arsenm, • tstellarAMD, mareko.

nhaehnle added a subscriber: llvm-commits.

Herald added subscribers: kzhuravl, arsenm. · View Herald Transcript · Jul 10 2016, 7:36 AM

nhaehnle added parent revisions: D22092: AMDGPU: Reduce the duration of whole-quad-mode, D22195: AMDGPU: Move SIWholeQuadMode pass to after machine scheduling, D22198: AMDGPU: Do not clobber SCC in SIWholeQuadMode. Jul 10 2016, 7:36 AM

Rebase and add test case changes that I missed.

arsenm added inline comments. Jul 21 2016, 3:34 PM

lib/Target/AMDGPU/SIWholeQuadMode.cpp
372	TII->usesVM_CNT(MI)

Use usesVM_CNT

• tstellarAMD accepted this revision. Sep 8 2016, 6:50 AM

• tstellarAMD edited edge metadata.

This revision is now accepted and ready to land. Sep 8 2016, 6:50 AM

Is this still needed?

Herald added subscribers: kerbowa, javed.absar, t-tye and 5 others. · View Herald Transcript · Jul 23 2020, 9:18 AM

nhaehnle abandoned this revision. Jul 27 2020, 2:12 AM

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIWholeQuadMode.cpp

30 lines

test/

CodeGen/

AMDGPU/

llvm.SI.image.sample.ll

16 lines

llvm.SI.image.sample.o.ll

16 lines

llvm.amdgcn.ps.live.ll

7 lines

si-scheduler.ll

2 lines

wqm.ll

9 lines

Diff 65683

lib/Target/AMDGPU/SIWholeQuadMode.cpp

Show First 20 Lines • Show All 112 Lines • ▼ Show 20 Lines	private:
const SIInstrInfo *TII;		const SIInstrInfo *TII;
const SIRegisterInfo *TRI;		const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;		MachineRegisterInfo *MRI;
LiveIntervals *LIS;		LiveIntervals *LIS;

DenseMap<const MachineInstr *, InstrInfo> Instructions;		DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;		DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
std::map<std::pair<SlotIndex, unsigned>, LaneBitmask> WQMValues;		std::map<std::pair<SlotIndex, unsigned>, LaneBitmask> WQMValues;
		SmallVector<MachineInstr *, 4> PreferExact;
SmallVector<MachineInstr *, 1> LiveMaskQueries;		SmallVector<MachineInstr *, 1> LiveMaskQueries;

void printInfo(MachineFunction &MF) const;		void printInfo(MachineFunction &MF) const;

void markValueWQM(SlotIndex Slot, unsigned Reg, LaneBitmask LaneMask,		void markValueWQM(SlotIndex Slot, unsigned Reg, LaneBitmask LaneMask,
std::vector<WorkItem> &Worklist);		std::vector<WorkItem> &Worklist);
void markInstruction(MachineInstr &MI, char Flag,		void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);		std::vector<WorkItem> &Worklist);
void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);		void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);		char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateValueSub(const WorkItem::Value &V, LaneBitmask LaneMask,		void propagateValueSub(const WorkItem::Value &V, LaneBitmask LaneMask,
const LiveRange &LR, std::vector<WorkItem> &Worklist);		const LiveRange &LR, std::vector<WorkItem> &Worklist);
void propagateValue(const WorkItem::Value &V,		void propagateValue(const WorkItem::Value &V,
std::vector<WorkItem> &Worklist);		std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);		void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);		void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
		void propagate(std::vector<WorkItem> &Worklist);
char analyzeFunction(MachineFunction &MF);		char analyzeFunction(MachineFunction &MF);

bool requiresCorrectState(const MachineInstr &MI) const;		bool requiresCorrectState(const MachineInstr &MI) const;

MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,		MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before);		MachineBasicBlock::iterator Before);
MachineBasicBlock::iterator		MachineBasicBlock::iterator
prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,		prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
▲ Show 20 Lines • Show All 189 Lines • ▼ Show 20 Lines	for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
if (TII->isDS(Opcode)) {		if (TII->isDS(Opcode)) {
markInstruction(MI, StateWQM, Worklist);		markInstruction(MI, StateWQM, Worklist);
GlobalFlags |= StateWQM;		GlobalFlags |= StateWQM;
} else if (TII->isWQM(Opcode)) {		} else if (TII->isWQM(Opcode)) {
// Sampling instructions don't need to produce results for all pixels		// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been		// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.		// computed for derivatives.
markUsesWQM(MI, Worklist);		markUsesWQM(MI, Worklist);
		PreferExact.push_back(&MI);
GlobalFlags |= StateWQM;		GlobalFlags |= StateWQM;
} else if (TII->isDisableWQM(MI)) {		} else if (TII->isDisableWQM(MI)) {
markInstruction(MI, StateExact, Worklist);		markInstruction(MI, StateExact, Worklist);
GlobalFlags |= StateExact;		GlobalFlags |= StateExact;
} else {		} else {
if (Opcode == AMDGPU::SI_PS_LIVE) {		if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);		LiveMaskQueries.push_back(&MI);
} else if (WQMOutputs) {		} else if (WQMOutputs) {
// Physical VGPRs correspond to shader inputs and outputs. Inputs are		// Physical VGPRs correspond to shader inputs and outputs. Inputs are
// only used, outputs are only defined.		// only used, outputs are only defined.
for (const MachineOperand &MO : MI.defs()) {		for (const MachineOperand &MO : MI.defs()) {
if (!MO.isReg())		if (!MO.isReg())
continue;		continue;

unsigned Reg = MO.getReg();		unsigned Reg = MO.getReg();

if (!TRI->isVirtualRegister(Reg) &&		if (!TRI->isVirtualRegister(Reg) &&
TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {		TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
markInstruction(MI, StateWQM, Worklist);		markInstruction(MI, StateWQM, Worklist);
GlobalFlags |= StateWQM;		GlobalFlags |= StateWQM;
break;		break;
}		}
}		}
}		}

		// Vector memory instructions prefer to be run in exact mode even when
		// they're loads, to save a bit of memory bandwidth where possible.
		if (TII->usesVM_CNT(MI))
		// [Inline review comment by arsenm (unsubmitted, not done): TII->usesVM_CNT(MI)]
		PreferExact.push_back(&MI);
}		}
}		}

if (WQMOutputs && MBB.succ_empty()) {		if (WQMOutputs && MBB.succ_empty()) {
// This is a prolog shader. Make sure we go back to exact mode at the end.		// This is a prolog shader. Make sure we go back to exact mode at the end.
assert(!Blocks[&MBB].Needs.Out);		assert(!Blocks[&MBB].Needs.Out);
Blocks[&MBB].Needs.Out = StateExact;		Blocks[&MBB].Needs.Out = StateExact;
Blocks[&MBB].Needs.In |= StateExact;		Blocks[&MBB].Needs.In |= StateExact;
▲ Show 20 Lines • Show All 111 Lines • ▼ Show 20 Lines	if (BI.Needs.Out & ~BI.Propagated.Out & StateWQM) {
for (MachineInstr &Terminator : MBB.terminators())		for (MachineInstr &Terminator : MBB.terminators())
markInstruction(Terminator, StateWQM, Worklist);		markInstruction(Terminator, StateWQM, Worklist);
}		}
}		}

Blocks[&MBB].Propagated = BI.Needs;		Blocks[&MBB].Propagated = BI.Needs;
}		}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {		void SIWholeQuadMode::propagate(std::vector<WorkItem> &Worklist) {
std::vector<WorkItem> Worklist;
char GlobalFlags = scanInstructions(MF, Worklist);

while (!Worklist.empty()) {		while (!Worklist.empty()) {
WorkItem WI = Worklist.back();		WorkItem WI = Worklist.back();
Worklist.pop_back();		Worklist.pop_back();

if (WI.MI)		if (WI.MI)
propagateInstruction(*WI.MI, Worklist);		propagateInstruction(*WI.MI, Worklist);
else if (WI.MBB)		else if (WI.MBB)
propagateBlock(*WI.MBB, Worklist);		propagateBlock(*WI.MBB, Worklist);
else		else
propagateValue(WI.V, Worklist);		propagateValue(WI.V, Worklist);
}		}
		}

		char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
		std::vector<WorkItem> Worklist;
		char GlobalFlags = scanInstructions(MF, Worklist);

		propagate(Worklist);

		for (MachineInstr *MI : PreferExact) {
		if (!Instructions[MI].Needs && !(Instructions[MI].OutNeeds & StateWQM)) {
		markInstruction(*MI, StateExact, Worklist);
		GlobalFlags |= StateExact;
		}
		}

		propagate(Worklist);

return GlobalFlags;		return GlobalFlags;
}		}

/// Whether \p MI really requires the exec state computed during analysis.		/// Whether \p MI really requires the exec state computed during analysis.
///		///
/// Scalar instructions must occasionally be marked WQM for correct propagation		/// Scalar instructions must occasionally be marked WQM for correct propagation
/// (e.g. thread masks leading up to branches), but when it comes to actual		/// (e.g. thread masks leading up to branches), but when it comes to actual
▲ Show 20 Lines • Show All 225 Lines • ▼ Show 20 Lines
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {		bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)		if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
return false;		return false;

Instructions.clear();		Instructions.clear();
WQMValues.clear();		WQMValues.clear();
Blocks.clear();		Blocks.clear();
LiveMaskQueries.clear();		LiveMaskQueries.clear();
		PreferExact.clear();

const SISubtarget &ST = MF.getSubtarget<SISubtarget>();		const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

TII = ST.getInstrInfo();		TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();		TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();		MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();		LIS = &getAnalysis<LiveIntervals>();

▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/llvm.SI.image.sample.ll

;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s		;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s		;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s

;CHECK-LABEL: {{^}}sample:		;CHECK-LABEL: {{^}}sample:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample() {		define amdgpu_ps void @sample() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_cl:		;CHECK-LABEL: {{^}}sample_cl:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_cl() {		define amdgpu_ps void @sample_cl() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

Show All 39 Lines	main_body:
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_b:		;CHECK-LABEL: {{^}}sample_b:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_b() {		define amdgpu_ps void @sample_b() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_b_cl:		;CHECK-LABEL: {{^}}sample_b_cl:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_b_cl() {		define amdgpu_ps void @sample_b_cl() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

Show All 39 Lines	main_body:
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_c:		;CHECK-LABEL: {{^}}sample_c:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_c() {		define amdgpu_ps void @sample_c() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_c_cl:		;CHECK-LABEL: {{^}}sample_c_cl:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_c_cl() {		define amdgpu_ps void @sample_c_cl() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

Show All 39 Lines	main_body:
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_c_b:		;CHECK-LABEL: {{^}}sample_c_b:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_c_b() {		define amdgpu_ps void @sample_c_b() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_c_b_cl:		;CHECK-LABEL: {{^}}sample_c_b_cl:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_c_b_cl() {		define amdgpu_ps void @sample_c_b_cl() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

▲ Show 20 Lines • Show All 68 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll

;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s		;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s		;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s

;CHECK-LABEL: {{^}}sample:		;CHECK-LABEL: {{^}}sample:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample() {		define amdgpu_ps void @sample() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_cl:		;CHECK-LABEL: {{^}}sample_cl:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_cl() {		define amdgpu_ps void @sample_cl() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

Show All 39 Lines	main_body:
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_b:		;CHECK-LABEL: {{^}}sample_b:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_b() {		define amdgpu_ps void @sample_b() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_b_cl:		;CHECK-LABEL: {{^}}sample_b_cl:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_b_cl() {		define amdgpu_ps void @sample_b_cl() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

Show All 39 Lines	main_body:
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_c:		;CHECK-LABEL: {{^}}sample_c:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_c() {		define amdgpu_ps void @sample_c() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_c_cl:		;CHECK-LABEL: {{^}}sample_c_cl:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_c_cl() {		define amdgpu_ps void @sample_c_cl() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

Show All 39 Lines	main_body:
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_c_b:		;CHECK-LABEL: {{^}}sample_c_b:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_c_b() {		define amdgpu_ps void @sample_c_b() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

;CHECK-LABEL: {{^}}sample_c_b_cl:		;CHECK-LABEL: {{^}}sample_c_b_cl:
;CHECK: s_wqm		;CHECK: s_wqm
;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf		;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @sample_c_b_cl() {		define amdgpu_ps void @sample_c_b_cl() {
main_body:		main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0		%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1		%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2		%r2 = extractelement <4 x float> %r, i32 2
%r3 = extractelement <4 x float> %r, i32 3		%r3 = extractelement <4 x float> %r, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)		call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
ret void		ret void
}		}

▲ Show 20 Lines • Show All 68 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll

Show All 11 Lines	define amdgpu_ps float @test1() {
%live = call i1 @llvm.amdgcn.ps.live()		%live = call i1 @llvm.amdgcn.ps.live()
%live.32 = zext i1 %live to i32		%live.32 = zext i1 %live to i32
%r = bitcast i32 %live.32 to float		%r = bitcast i32 %live.32 to float
ret float %r		ret float %r
}		}

; CHECK-LABEL: {{^}}test2:		; CHECK-LABEL: {{^}}test2:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec		; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
		; CHECK-DAG: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[LIVE]]
		; TODO - figure out why MachineCopyPropagation doesn't eliminate the above
; CHECK-DAG: s_wqm_b64 exec, exec		; CHECK-DAG: s_wqm_b64 exec, exec
; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]		; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[COPY]]
; CHECK: image_sample v0, [[VAR]],		; CHECK: image_sample v0, [[VAR]],
define amdgpu_ps float @test2() {		define amdgpu_ps float @test2() {
%live = call i1 @llvm.amdgcn.ps.live()		%live = call i1 @llvm.amdgcn.ps.live()
%live.32 = zext i1 %live to i32		%live.32 = zext i1 %live to i32

%t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)		%t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

%r = extractelement <4 x float> %t, i32 0		%r = extractelement <4 x float> %t, i32 0
ret float %r		ret float %r
}		}

; CHECK-LABEL: {{^}}test3:		; CHECK-LABEL: {{^}}test3:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec		; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
		; CHECK-DAG: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[LIVE]]
; CHECK-DAG: s_wqm_b64 exec, exec		; CHECK-DAG: s_wqm_b64 exec, exec
; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1		; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[COPY]], -1
; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]		; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
; CHECK: ; %dead		; CHECK: ; %dead
define amdgpu_ps float @test3(i32 %in) {		define amdgpu_ps float @test3(i32 %in) {
entry:		entry:
%live = call i1 @llvm.amdgcn.ps.live()		%live = call i1 @llvm.amdgcn.ps.live()
br i1 %live, label %end, label %dead		br i1 %live, label %end, label %dead

dead:		dead:
Show All 16 Lines

test/CodeGen/AMDGPU/si-scheduler.ll

	; FIXME: The si scheduler crashes when lane mask tracking is enabled, so			; FIXME: The si scheduler crashes when lane mask tracking is enabled, so
	; we need to disable this when the si scheduler is being used.			; we need to disable this when the si scheduler is being used.
	; The only way the subtarget knows that the si machine scheduler is being used			; The only way the subtarget knows that the si machine scheduler is being used
	; is to specify -mattr=si-scheduler. If we just pass --misched=si, the backend			; is to specify -mattr=si-scheduler. If we just pass --misched=si, the backend
	; won't know what scheduler we are using.			; won't know what scheduler we are using.
	; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s \| FileCheck %s			; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s \| FileCheck %s

	; The test checks the "si" machine scheduler pass works correctly.			; The test checks the "si" machine scheduler pass works correctly.

	; CHECK-LABEL: {{^}}main:			; CHECK-LABEL: {{^}}main:
	; CHECK: s_wqm
	; CHECK: s_load_dwordx4			; CHECK: s_load_dwordx4
	; CHECK: s_load_dwordx8			; CHECK: s_load_dwordx8
				; CHECK: s_wqm
	; CHECK: s_waitcnt lgkmcnt(0)			; CHECK: s_waitcnt lgkmcnt(0)
	; CHECK: image_sample			; CHECK: image_sample
	; CHECK: s_waitcnt vmcnt(0)			; CHECK: s_waitcnt vmcnt(0)
	; CHECK: exp			; CHECK: exp
	; CHECK: s_endpgm			; CHECK: s_endpgm
	define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {			define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
	main_body:			main_body:
	%tmp = bitcast [34 x <8 x i32>] addrspace(2)* %arg3 to <32 x i8> addrspace(2)*			%tmp = bitcast [34 x <8 x i32>] addrspace(2)* %arg3 to <32 x i8> addrspace(2)*
	Show All 39 Lines

test/CodeGen/AMDGPU/wqm.ll

	;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs \| FileCheck %s --check-prefix=CHECK --check-prefix=SI			;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs \| FileCheck %s --check-prefix=CHECK --check-prefix=SI
	;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck %s --check-prefix=CHECK --check-prefix=VI			;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck %s --check-prefix=CHECK --check-prefix=VI

	; Check that WQM isn't triggered by image load/store intrinsics.			; Check that WQM isn't triggered by image load/store intrinsics.
	;			;
	;CHECK-LABEL: {{^}}test1:			;CHECK-LABEL: {{^}}test1:
	;CHECK-NOT: s_wqm			;CHECK-NOT: s_wqm
	define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {			define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
	main_body:			main_body:
	%tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)			%tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
	call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)			call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
	ret <4 x float> %tex			ret <4 x float> %tex
	}			}

	; Check that WQM is triggered by image samples and left untouched for loads...			; Check that WQM is triggered by image samples and then disabled again if the
				; rest of the shader doesn't care.
	;			;
	;CHECK-LABEL: {{^}}test2:			;CHECK-LABEL: {{^}}test2:
	;CHECK-NEXT: ; %main_body			;CHECK-NEXT: ; %main_body
				;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK-NEXT: s_wqm_b64 exec, exec			;CHECK-NEXT: s_wqm_b64 exec, exec
				;CHECK: s_and_b64 exec, exec, [[ORIG]]
	;CHECK: image_sample			;CHECK: image_sample
	;CHECK-NOT: exec			;CHECK-NOT: exec
	;CHECK: _load_dword v0,			;CHECK: _load_dword v0,
	define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {			define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
	main_body:			main_body:
	%c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)			%c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
	%c.2 = bitcast <4 x float> %c.1 to <4 x i32>			%c.2 = bitcast <4 x float> %c.1 to <4 x i32>
	%c.3 = extractelement <4 x i32> %c.2, i32 0			%c.3 = extractelement <4 x i32> %c.2, i32 0
	%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3			%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
	%data = load float, float addrspace(1)* %gep			%data = load float, float addrspace(1)* %gep
	ret float %data			ret float %data
	}			}

	; ... but disabled for stores (and, in this simple case, not re-enabled).			; Check that WQM is disabled for stores (and, in this simple case, not re-enabled).
	;			;
	;CHECK-LABEL: {{^}}test3:			;CHECK-LABEL: {{^}}test3:
	;CHECK-NEXT: ; %main_body			;CHECK-NEXT: ; %main_body
	;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	;CHECK-NEXT: s_wqm_b64 exec, exec			;CHECK-NEXT: s_wqm_b64 exec, exec
	;CHECK: s_and_b64 exec, exec, [[ORIG]]			;CHECK: s_and_b64 exec, exec, [[ORIG]]
	;CHECK: image_sample			;CHECK: image_sample
	;CHECK: store			;CHECK: store
	▲ Show 20 Lines • Show All 384 Lines • ▼ Show 20 Lines
	; CHECK-LABEL: {{^}}test_subregs:			; CHECK-LABEL: {{^}}test_subregs:
	; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec			; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
	; CHECK: s_wqm_b64 exec, exec			; CHECK: s_wqm_b64 exec, exec
	; CHECK: v_interp_p1_f32			; CHECK: v_interp_p1_f32
	; CHECK: v_interp_p2_f32			; CHECK: v_interp_p2_f32
	; CHECK: s_and_b64 exec, exec, [[ORIG]]			; CHECK: s_and_b64 exec, exec, [[ORIG]]
	; CHECK: _store			; CHECK: _store
	; CHECK: s_wqm_b64 exec, exec			; CHECK: s_wqm_b64 exec, exec
				; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 1
				; CHECK: s_and_b64 exec, exec, [[ORIG]]
	; CHECK: image_sample			; CHECK: image_sample
	;			;
	; Early coalescing merges %c into a 64 bit VGPR pair, so correctness requires			; Early coalescing merges %c into a 64 bit VGPR pair, so correctness requires
	; tracking of subregisters.			; tracking of subregisters.
	;			;
	define amdgpu_ps <4 x float> @test_subregs(i32 inreg %prims, <2 x i32> %ij, i32 %idx) #1 {			define amdgpu_ps <4 x float> @test_subregs(i32 inreg %prims, <2 x i32> %ij, i32 %idx) #1 {
	main_body:			main_body:
	%c = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %prims, <2 x i32> %ij)			%c = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %prims, <2 x i32> %ij)
	▲ Show 20 Lines • Show All 99 Lines • Show Last 20 Lines