This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Fix creating invalid copy when adjusting dmask
ClosedPublic

Authored by arsenm on Oct 17 2017, 10:44 PM.

Download Raw Diff

Details

Reviewers

airlied
mareko

Summary

Move the entire optimization to one place. Previously, it was possible
to adjust dmask without changing the register class of the output
instruction, because the two adjustments were done in separate places. Fix all
[... the rest of the summary is truncated in this archive]

Diff Detail

Event Timeline

arsenm created this revision. · Oct 17 2017, 10:44 PM

Herald added subscribers: t-tye, tpr, dstuttard and 4 others. · View Herald Transcript · Oct 17 2017, 10:44 PM

There are some test crashes with this, and I think I made the wrong guess about dmask behavior.

aejsmith added a subscriber: aejsmith. · Oct 18 2017, 2:14 AM

Each bit of dmask determines whether the corresponding component is enabled. Image opcodes return all 4 components if dmask == 0xf. If dmask == 0x2, image opcodes return only the 2nd component, as <1 x float>. If dmask == 0x5, image opcodes return the 1st and 3rd components as <2 x float>. If dmask == 0xa, image opcodes return the 2nd and 4th components as <2 x float>.
Gather4 opcodes are an exception and always return 4 components.

Fix other test failures

LGTM.

This revision is now accepted and ready to land. · Oct 24 2017, 2:14 AM

With this, when dmask = 0x2, we get "image_get_lod v[0:1], ...". Based on what Marek said, wouldn't it only be returning a single value to v0? What would v1 get set to?

In comparison, for image_sample with dmask = 0x2 I see only a single destination register specified. That's also what I see on the proprietary driver for image_get_lod.

In D39040#906491, @aejsmith wrote:

With this, when dmask = 0x2, we get "image_get_lod v[0:1], ...". Based on what Marek said, wouldn't it only be returning a single value to v0? What would v1 get set to?

In comparison, for image_sample with dmask = 0x2 I see only a single destination register specified. That's also what I see on the proprietary driver for image_get_lod.

You are right. dmask = 0x2 makes image_get_lod return the 2nd channel as <1 x float>, which will be in v0 in your example.

In case it's still confusing: the number of components returned by image opcodes is popcount(dmask). The code could just do popcount(dmask) instead of computing BitsSet.

arsenm updated this revision to Diff 125016. · Nov 30 2017, 2:14 PM

arsenm edited the summary of this revision. (Show Details)

There are no piglit regressions.

r319705

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

AMDGPUISelDAGToDAG.cpp

12 lines

6 lines

56 lines

2 lines

88 lines

26 lines

test/

CodeGen/

AMDGPU/

adjust-writemask-invalid-copy.ll

51 lines

Diff 125016

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

	Show First 20 Lines • Show All 2,068 Lines • ▼ Show 20 Lines
	}			}

	void AMDGPUDAGToDAGISel::PostprocessISelDAG() {			void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
	const AMDGPUTargetLowering& Lowering =			const AMDGPUTargetLowering& Lowering =
	static_cast<const AMDGPUTargetLowering>(getTargetLowering());			static_cast<const AMDGPUTargetLowering>(getTargetLowering());
	bool IsModified = false;			bool IsModified = false;
	do {			do {
	IsModified = false;			IsModified = false;

	// Go over all selected nodes and try to fold them a bit more			// Go over all selected nodes and try to fold them a bit more
	for (SDNode &Node : CurDAG->allnodes()) {			SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
	MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);			while (Position != CurDAG->allnodes_end()) {
				SDNode Node = &Position++;
				MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
	if (!MachineNode)			if (!MachineNode)
	continue;			continue;

	SDNode ResNode = Lowering.PostISelFolding(MachineNode, CurDAG);			SDNode ResNode = Lowering.PostISelFolding(MachineNode, CurDAG);
	if (ResNode != &Node) {			if (ResNode != Node) {
	ReplaceUses(&Node, ResNode);			if (ResNode)
				ReplaceUses(Node, ResNode);
	IsModified = true;			IsModified = true;
	}			}
	}			}
	CurDAG->RemoveDeadNodes();			CurDAG->RemoveDeadNodes();
	} while (IsModified);			} while (IsModified);
	}			}

	void R600DAGToDAGISel::Select(SDNode *N) {			void R600DAGToDAGISel::Select(SDNode *N) {
	▲ Show 20 Lines • Show All 88 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUInstrInfo.h

Show First 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	bool shouldScheduleLoadsNear(SDNode Load1, SDNode Load2,
int64_t Offset1, int64_t Offset2,		int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;		unsigned NumLoads) const override;

/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.		/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does		/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.		/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;		int pseudoToMCOpcode(int Opcode) const;

/// \brief Given a MIMG \p Opcode that writes all 4 channels, return the		/// \brief Given a MIMG \p MI that writes any number of channels, return the
/// equivalent opcode that writes \p Channels Channels.		/// equivalent opcode that writes \p NewChannels Channels.
int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;		int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) const;
};		};
} // End llvm namespace		} // End llvm namespace

#endif		#endif

lib/Target/AMDGPU/AMDGPUInstrInfo.cpp

Show First 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	assert(Offset1 > Offset0 &&
"Second offset should be larger than first offset!");		"Second offset should be larger than first offset!");
// If we have less than 16 loads in a row, and the offsets are within 64		// If we have less than 16 loads in a row, and the offsets are within 64
// bytes, then schedule together.		// bytes, then schedule together.

// A cacheline is 64 bytes (for global memory).		// A cacheline is 64 bytes (for global memory).
return (NumLoads <= 16 && (Offset1 - Offset0) < 64);		return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}		}

int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {		static AMDGPU::Channels indexToChannel(unsigned Channel) {
switch (Channels) {		switch (Channel) {
default: return Opcode;		case 1:
case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);		return AMDGPU::Channels_1;
case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);		case 2:
case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);		return AMDGPU::Channels_2;
		case 3:
		return AMDGPU::Channels_3;
		case 4:
		return AMDGPU::Channels_4;
		default:
		llvm_unreachable("invalid MIMG channel");
}		}
}		}

		// FIXME: Need to handle d16 images correctly.
		static unsigned rcToChannels(unsigned RCID) {
		switch (RCID) {
		case AMDGPU::VGPR_32RegClassID:
		return 1;
		case AMDGPU::VReg_64RegClassID:
		return 2;
		case AMDGPU::VReg_96RegClassID:
		return 3;
		case AMDGPU::VReg_128RegClassID:
		return 4;
		default:
		llvm_unreachable("invalid MIMG register class");
		}
		}

		int AMDGPUInstrInfo::getMaskedMIMGOp(unsigned Opc,
		unsigned NewChannels) const {
		AMDGPU::Channels Channel = indexToChannel(NewChannels);
		unsigned OrigChannels = rcToChannels(get(Opc).OpInfo[0].RegClass);
		if (NewChannels == OrigChannels)
		return Opc;

		switch (OrigChannels) {
		case 1:
		return AMDGPU::getMaskedMIMGOp1(Opc, Channel);
		case 2:
		return AMDGPU::getMaskedMIMGOp2(Opc, Channel);
		case 3:
		return AMDGPU::getMaskedMIMGOp3(Opc, Channel);
		case 4:
		return AMDGPU::getMaskedMIMGOp4(Opc, Channel);
		default:
		llvm_unreachable("invalid MIMG channel");
		}
		}


// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td		// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {		enum SIEncodingFamily {
SI = 0,		SI = 0,
VI = 1,		VI = 1,
SDWA = 2,		SDWA = 2,
SDWA9 = 3,		SDWA9 = 3,
GFX9 = 4		GFX9 = 4
};		};
▲ Show 20 Lines • Show All 56 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.h

Show First 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	class SITargetLowering final : public AMDGPUTargetLowering {
SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,		SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const;		SelectionDAG &DAG) const;

SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;

void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;		SDNode adjustWritemask(MachineSDNode &N, SelectionDAG &DAG) const;

SDValue performUCharToFloatCombine(SDNode *N,		SDValue performUCharToFloatCombine(SDNode *N,
DAGCombinerInfo &DCI) const;		DAGCombinerInfo &DCI) const;
SDValue performSHLPtrCombine(SDNode *N,		SDValue performSHLPtrCombine(SDNode *N,
unsigned AS,		unsigned AS,
EVT MemVT,		EVT MemVT,
DAGCombinerInfo &DCI) const;		DAGCombinerInfo &DCI) const;

▲ Show 20 Lines • Show All 197 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

Show First 20 Lines • Show All 6,572 Lines • ▼ Show 20 Lines	static unsigned SubIdx2Lane(unsigned Idx) {
case AMDGPU::sub0: return 0;		case AMDGPU::sub0: return 0;
case AMDGPU::sub1: return 1;		case AMDGPU::sub1: return 1;
case AMDGPU::sub2: return 2;		case AMDGPU::sub2: return 2;
case AMDGPU::sub3: return 3;		case AMDGPU::sub3: return 3;
}		}
}		}

/// \brief Adjust the writemask of MIMG instructions		/// \brief Adjust the writemask of MIMG instructions
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,		SDNode SITargetLowering::adjustWritemask(MachineSDNode &Node,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDNode *Users[4] = { };		SDNode *Users[4] = { nullptr };
unsigned Lane = 0;		unsigned Lane = 0;
unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;		unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);		unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;		unsigned NewDmask = 0;

// Try to figure out the used register components		// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();		for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
I != E; ++I) {		I != E; ++I) {

// Don't look at users of the chain.		// Don't look at users of the chain.
if (I.getUse().getResNo() != 0)		if (I.getUse().getResNo() != 0)
continue;		continue;

// Abort if we can't understand the usage		// Abort if we can't understand the usage
if (!I->isMachineOpcode() \|\|		if (!I->isMachineOpcode() \|\|
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)		I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
return;		return Node;

// Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.		// Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
// Note that subregs are packed, i.e. Lane==0 is the first bit set		// Note that subregs are packed, i.e. Lane==0 is the first bit set
// in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit		// in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
// set, etc.		// set, etc.
Lane = SubIdx2Lane(I->getConstantOperandVal(1));		Lane = SubIdx2Lane(I->getConstantOperandVal(1));

// Set which texture component corresponds to the lane.		// Set which texture component corresponds to the lane.
unsigned Comp;		unsigned Comp;
for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {		for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
assert(Dmask);		assert(Dmask);
Comp = countTrailingZeros(Dmask);		Comp = countTrailingZeros(Dmask);
Dmask &= ~(1 << Comp);		Dmask &= ~(1 << Comp);
}		}

// Abort if we have more than one user per component		// Abort if we have more than one user per component
if (Users[Lane])		if (Users[Lane])
return;		return Node;

Users[Lane] = *I;		Users[Lane] = *I;
NewDmask \|= 1 << Comp;		NewDmask \|= 1 << Comp;
}		}

// Abort if there's no change		// Abort if there's no change
if (NewDmask == OldDmask)		if (NewDmask == OldDmask)
return;		return Node;

		unsigned BitsSet = countPopulation(NewDmask);

		const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
		int NewOpcode = TII->getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
		assert(NewOpcode != -1 &&
		NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
		"failed to find equivalent MIMG op");

// Adjust the writemask in the node		// Adjust the writemask in the node
std::vector<SDValue> Ops;		SmallVector<SDValue, 12> Ops;
Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);		Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));		Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());		Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);

// If we only got one lane, replace it with a copy		MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
// (if NewDmask has only one bit set...)
if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {		auto NewVTList =
SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),		DAG.getVTList(BitsSet == 1 ?
MVT::i32);		SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet),
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,		MVT::Other);
SDLoc(), Users[Lane]->getValueType(0),
SDValue(Node, 0), RC);		MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
		NewVTList, Ops);
		// Update chain.
		DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));

		if (BitsSet == 1) {
		assert(Node->hasNUsesOfValue(1, 0));
		SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
		SDLoc(Node), Users[Lane]->getValueType(0),
		SDValue(NewNode, 0));
DAG.ReplaceAllUsesWith(Users[Lane], Copy);		DAG.ReplaceAllUsesWith(Users[Lane], Copy);
return;		return nullptr;
}		}

// Update the users of the node with the new indices		// Update the users of the node with the new indices
for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {		for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
SDNode *User = Users[i];		SDNode *User = Users[i];
if (!User)		if (!User)
continue;		continue;

SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);		SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
DAG.UpdateNodeOperands(User, User->getOperand(0), Op);		DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);

switch (Idx) {		switch (Idx) {
default: break;		default: break;
case AMDGPU::sub0: Idx = AMDGPU::sub1; break;		case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
case AMDGPU::sub1: Idx = AMDGPU::sub2; break;		case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
case AMDGPU::sub2: Idx = AMDGPU::sub3; break;		case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
}		}
}		}

		DAG.RemoveDeadNode(Node);
		return nullptr;
}		}

static bool isFrameIndexOp(SDValue Op) {		static bool isFrameIndexOp(SDValue Op) {
if (Op.getOpcode() == ISD::AssertZext)		if (Op.getOpcode() == ISD::AssertZext)
Op = Op.getOperand(0);		Op = Op.getOperand(0);

return isa<FrameIndexSDNode>(Op);		return isa<FrameIndexSDNode>(Op);
}		}
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
Node->getOperand(i).getValueType(),		Node->getOperand(i).getValueType(),
Node->getOperand(i)), 0));		Node->getOperand(i)), 0));
}		}

return DAG.UpdateNodeOperands(Node, Ops);		return DAG.UpdateNodeOperands(Node, Ops);
}		}

/// \brief Fold the instructions after selecting them.		/// \brief Fold the instructions after selecting them.
		/// Returns null if users were already updated.
SDNode SITargetLowering::PostISelFolding(MachineSDNode Node,		SDNode SITargetLowering::PostISelFolding(MachineSDNode Node,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();		const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
unsigned Opcode = Node->getMachineOpcode();		unsigned Opcode = Node->getMachineOpcode();

if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&		if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
!TII->isGather4(Opcode))		!TII->isGather4(Opcode)) {
adjustWritemask(Node, DAG);		return adjustWritemask(Node, DAG);
		}

if (Opcode == AMDGPU::INSERT_SUBREG \|\|		if (Opcode == AMDGPU::INSERT_SUBREG \|\|
Opcode == AMDGPU::REG_SEQUENCE) {		Opcode == AMDGPU::REG_SEQUENCE) {
legalizeTargetIndependentNode(Node, DAG);		legalizeTargetIndependentNode(Node, DAG);
return Node;		return Node;
}		}

switch (Opcode) {		switch (Opcode) {
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines	void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();		MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

if (TII->isVOP3(MI.getOpcode())) {		if (TII->isVOP3(MI.getOpcode())) {
// Make sure constant bus requirements are respected.		// Make sure constant bus requirements are respected.
TII->legalizeOperandsVOP3(MRI, MI);		TII->legalizeOperandsVOP3(MRI, MI);
return;		return;
}		}

if (TII->isMIMG(MI)) {
unsigned VReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(VReg);
// TODO: Need mapping tables to handle other cases (register classes).
if (RC != &AMDGPU::VReg_128RegClass)
return;

unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
unsigned BitsSet = 0;
for (unsigned i = 0; i < 4; ++i)
BitsSet += Writemask & (1 << i) ? 1 : 0;
switch (BitsSet) {
default: return;
case 1: RC = &AMDGPU::VGPR_32RegClass; break;
case 2: RC = &AMDGPU::VReg_64RegClass; break;
case 3: RC = &AMDGPU::VReg_96RegClass; break;
}

unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
MI.setDesc(TII->get(NewOpcode));
MRI.setRegClass(VReg, RC);
return;
}

// Replace unused atomics with the no return version.		// Replace unused atomics with the no return version.
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());		int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {		if (NoRetAtomicOp != -1) {
if (!Node->hasAnyUseOfValue(0)) {		if (!Node->hasAnyUseOfValue(0)) {
MI.setDesc(TII->get(NoRetAtomicOp));		MI.setDesc(TII->get(NoRetAtomicOp));
MI.RemoveOperand(0);		MI.RemoveOperand(0);
return;		return;
}		}
▲ Show 20 Lines • Show All 240 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.td

	Show First 20 Lines • Show All 1,817 Lines • ▼ Show 20 Lines
	def getBasicFromSDWAOp : InstrMapping {			def getBasicFromSDWAOp : InstrMapping {
	let FilterClass = "VOP";			let FilterClass = "VOP";
	let RowFields = ["OpName"];			let RowFields = ["OpName"];
	let ColFields = ["AsmVariantName"];			let ColFields = ["AsmVariantName"];
	let KeyCol = ["SDWA"];			let KeyCol = ["SDWA"];
	let ValueCols = [["Default"]];			let ValueCols = [["Default"]];
	}			}

	def getMaskedMIMGOp : InstrMapping {			def getMaskedMIMGOp1 : InstrMapping {
				let FilterClass = "MIMG_Mask";
				let RowFields = ["Op"];
				let ColFields = ["Channels"];
				let KeyCol = ["1"];
				let ValueCols = [["2"], ["3"], ["4"] ];
				}

				def getMaskedMIMGOp2 : InstrMapping {
				let FilterClass = "MIMG_Mask";
				let RowFields = ["Op"];
				let ColFields = ["Channels"];
				let KeyCol = ["2"];
				let ValueCols = [["1"], ["3"], ["4"] ];
				}

				def getMaskedMIMGOp3 : InstrMapping {
				let FilterClass = "MIMG_Mask";
				let RowFields = ["Op"];
				let ColFields = ["Channels"];
				let KeyCol = ["3"];
				let ValueCols = [["1"], ["2"], ["4"] ];
				}

				def getMaskedMIMGOp4 : InstrMapping {
	let FilterClass = "MIMG_Mask";			let FilterClass = "MIMG_Mask";
	let RowFields = ["Op"];			let RowFields = ["Op"];
	let ColFields = ["Channels"];			let ColFields = ["Channels"];
	let KeyCol = ["4"];			let KeyCol = ["4"];
	let ValueCols = [["1"], ["2"], ["3"] ];			let ValueCols = [["1"], ["2"], ["3"] ];
	}			}

	// Maps an commuted opcode to its original version			// Maps an commuted opcode to its original version
	▲ Show 20 Lines • Show All 68 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll

This file was added.

				; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

				; GCN-LABEL: {{^}}adjust_writemask_crash_0:
				; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2
				; GCN-NOT: v1
				; GCN-NOT: v0
				; GCN: buffer_store_dword v0
				define amdgpu_ps void @adjust_writemask_crash_0() #0 {
				main_body:
				%tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
				%tmp1 = bitcast <2 x float> %tmp to <2 x i32>
				%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
				%tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
				%tmp4 = extractelement <4 x float> %tmp3, i32 0
				store volatile float %tmp4, float addrspace(1)* undef
				ret void
				}

				; GCN-LABEL: {{^}}adjust_writemask_crash_1:
				; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1
				; GCN-NOT: v1
				; GCN-NOT: v0
				; GCN: buffer_store_dword v0
				define amdgpu_ps void @adjust_writemask_crash_1() #0 {
				main_body:
				%tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
				%tmp1 = bitcast <2 x float> %tmp to <2 x i32>
				%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
				%tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
				%tmp4 = extractelement <4 x float> %tmp3, i32 1
				store volatile float %tmp4, float addrspace(1)* undef
				ret void
				}

				define amdgpu_ps void @adjust_writemask_crash_0_v4() #0 {
				main_body:
				%tmp = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 5, i1 false, i1 false, i1 false, i1 false, i1 false)
				%tmp1 = bitcast <4 x float> %tmp to <4 x i32>
				%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
				%tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
				%tmp4 = extractelement <4 x float> %tmp3, i32 0
				store volatile float %tmp4, float addrspace(1)* undef
				ret void
				}


				declare <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
				declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1

				attributes #0 = { nounwind }
				attributes #1 = { nounwind readonly }