Diff 296645

llvm/lib/Target/AMDGPU/AMDGPU.td

Show First 20 Lines • Show All 228 Lines • ▼ Show 20 Lines
>;		>;

def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",		def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
"HasOffset3fBug",		"HasOffset3fBug",
"true",		"true",
"Branch offset of 3f hardware bug"		"Branch offset of 3f hardware bug"
>;		>;

		def FeatureImageStoreD16Bug : SubtargetFeature<"image-store-d16-bug",
		"HasImageStoreD16Bug",
		"true",
		"Image Store D16 hardware bug"
		>;

		def FeatureImageGather4D16Bug : SubtargetFeature<"image-gather4-d16-bug",
		"HasImageGather4D16Bug",
		"true",
		"Image Gather4 D16 hardware bug"
		>;

class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <		class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,		"ldsbankcount"#Value,
"LDSBankCount",		"LDSBankCount",
!cast<string>(Value),		!cast<string>(Value),
"The number of LDS banks per compute unit."		"The number of LDS banks per compute unit."
>;		>;

def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;		def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
▲ Show 20 Lines • Show All 560 Lines • ▼ Show 20 Lines	def FeatureISAVersion8_0_3 : FeatureSet<
FeatureUnpackedD16VMem,		FeatureUnpackedD16VMem,
FeatureDoesNotSupportXNACK,		FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3]>;

def FeatureISAVersion8_1_0 : FeatureSet<		def FeatureISAVersion8_1_0 : FeatureSet<
[FeatureVolcanicIslands,		[FeatureVolcanicIslands,
FeatureLDSBankCount16,		FeatureLDSBankCount16,
FeatureXNACK,		FeatureXNACK,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageStoreD16Bug,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_0 : FeatureSet<		def FeatureISAVersion9_0_0 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
FeatureMadMixInsts,		FeatureMadMixInsts,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureCodeObjectV3,		FeatureCodeObjectV3,
FeatureDoesNotSupportXNACK,		FeatureDoesNotSupportXNACK,
FeatureDoesNotSupportSRAMECC]>;		FeatureDoesNotSupportSRAMECC,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_2 : FeatureSet<		def FeatureISAVersion9_0_2 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
FeatureMadMixInsts,		FeatureMadMixInsts,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureXNACK,		FeatureXNACK,
FeatureDoesNotSupportSRAMECC,		FeatureDoesNotSupportSRAMECC,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_4 : FeatureSet<		def FeatureISAVersion9_0_4 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureFmaMixInsts,		FeatureFmaMixInsts,
FeatureDoesNotSupportXNACK,		FeatureDoesNotSupportXNACK,
FeatureDoesNotSupportSRAMECC,		FeatureDoesNotSupportSRAMECC,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_6 : FeatureSet<		def FeatureISAVersion9_0_6 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
HalfRate64Ops,		HalfRate64Ops,
FeatureFmaMixInsts,		FeatureFmaMixInsts,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureDLInsts,		FeatureDLInsts,
FeatureDot1Insts,		FeatureDot1Insts,
FeatureDot2Insts,		FeatureDot2Insts,
FeatureDoesNotSupportXNACK,		FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_8 : FeatureSet<		def FeatureISAVersion9_0_8 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
HalfRate64Ops,		HalfRate64Ops,
FeatureFmaMixInsts,		FeatureFmaMixInsts,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureDLInsts,		FeatureDLInsts,
FeatureDot1Insts,		FeatureDot1Insts,
FeatureDot2Insts,		FeatureDot2Insts,
FeatureDot3Insts,		FeatureDot3Insts,
FeatureDot4Insts,		FeatureDot4Insts,
FeatureDot5Insts,		FeatureDot5Insts,
FeatureDot6Insts,		FeatureDot6Insts,
FeatureMAIInsts,		FeatureMAIInsts,
FeaturePkFmacF16Inst,		FeaturePkFmacF16Inst,
FeatureAtomicFaddInsts,		FeatureAtomicFaddInsts,
FeatureSRAMECC,		FeatureSRAMECC,
FeatureMFMAInlineLiteralBug,		FeatureMFMAInlineLiteralBug,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_9 : FeatureSet<		def FeatureISAVersion9_0_9 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
FeatureMadMixInsts,		FeatureMadMixInsts,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureXNACK,		FeatureXNACK,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageGather4D16Bug]>;

// TODO: Organize more features into groups.		// TODO: Organize more features into groups.
def FeatureGroup {		def FeatureGroup {
// Bugs present on gfx10.1.		// Bugs present on gfx10.1.
list<SubtargetFeature> GFX10_1_Bugs = [		list<SubtargetFeature> GFX10_1_Bugs = [
FeatureVcmpxPermlaneHazard,		FeatureVcmpxPermlaneHazard,
FeatureVMEMtoScalarWriteHazard,		FeatureVMEMtoScalarWriteHazard,
FeatureSMEMtoVectorWriteHazard,		FeatureSMEMtoVectorWriteHazard,
▲ Show 20 Lines • Show All 402 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Show First 20 Lines • Show All 1,533 Lines • ▼ Show 20 Lines	if (BaseOpcode->Atomic) {
} else {		} else {
DMask = Is64Bit ? 0x3 : 0x1;		DMask = Is64Bit ? 0x3 : 0x1;
NumVDataDwords = Is64Bit ? 2 : 1;		NumVDataDwords = Is64Bit ? 2 : 1;
}		}
} else {		} else {
DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();		DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);		DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);

if (BaseOpcode->Store) {
VDataIn = MI.getOperand(1).getReg();
VDataTy = MRI->getType(VDataIn);
NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
} else {
VDataOut = MI.getOperand(0).getReg();
VDataTy = MRI->getType(VDataOut);
NumVDataDwords = DMaskLanes;

// One memoperand is mandatory, except for getresinfo.		// One memoperand is mandatory, except for getresinfo.
// FIXME: Check this in verifier.		// FIXME: Check this in verifier.
if (!MI.memoperands_empty()) {		if (!MI.memoperands_empty()) {
const MachineMemOperand MMO = MI.memoperands_begin();		const MachineMemOperand MMO = MI.memoperands_begin();

// Infer d16 from the memory size, as the register type will be mangled by		// Infer d16 from the memory size, as the register type will be mangled by
// unpacked subtargets, or by TFE.		// unpacked subtargets, or by TFE.
IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;		IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
		}

		if (BaseOpcode->Store) {
		VDataIn = MI.getOperand(1).getReg();
		VDataTy = MRI->getType(VDataIn);
		NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
		} else {
		VDataOut = MI.getOperand(0).getReg();
		VDataTy = MRI->getType(VDataOut);
		NumVDataDwords = DMaskLanes;

if (IsD16 && !STI.hasUnpackedD16VMem())		if (IsD16 && !STI.hasUnpackedD16VMem())
NumVDataDwords = (DMaskLanes + 1) / 2;		NumVDataDwords = (DMaskLanes + 1) / 2;
}		}
}		}
}

// Optimize _L to _LZ when _L is zero		// Optimize _L to _LZ when _L is zero
if (LZMappingInfo) {		if (LZMappingInfo) {
// The legalizer replaced the register with an immediate 0 if we need to		// The legalizer replaced the register with an immediate 0 if we need to
// change the opcode.		// change the opcode.
const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);		const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
if (Lod.isImm()) {		if (Lod.isImm()) {
assert(Lod.getImm() == 0);		assert(Lod.getImm() == 0);
▲ Show 20 Lines • Show All 2,648 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Show First 20 Lines • Show All 140 Lines • ▼ Show 20 Lines	bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;		MachineIRBuilder &B) const;
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,		bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, unsigned AddrSpace) const;		MachineIRBuilder &B, unsigned AddrSpace) const;

std::tuple<Register, unsigned, unsigned>		std::tuple<Register, unsigned, unsigned>
splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;		splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;

Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,		Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;		Register Reg, bool ImageStore = false) const;
bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,		bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool IsFormat) const;		MachineIRBuilder &B, bool IsFormat) const;
bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,		bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool IsFormat) const;		MachineIRBuilder &B, bool IsFormat) const;
Register fixStoreSourceType(MachineIRBuilder &B, Register VData,		Register fixStoreSourceType(MachineIRBuilder &B, Register VData,
bool IsFormat) const;		bool IsFormat) const;

bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,		bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
Show All 30 Lines

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Show First 20 Lines • Show All 3,522 Lines • ▼ Show 20 Lines	if (!BaseReg)
BaseReg = B.buildConstant(S32, 0).getReg(0);		BaseReg = B.buildConstant(S32, 0).getReg(0);

return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);		return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}		}

/// Handle register layout difference for f16 images for some subtargets.		/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,		Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
MachineRegisterInfo &MRI,		MachineRegisterInfo &MRI,
Register Reg) const {		Register Reg,
if (!ST.hasUnpackedD16VMem())		bool ImageStore) const {
return Reg;

const LLT S16 = LLT::scalar(16);		const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);		const LLT S32 = LLT::scalar(32);
LLT StoreVT = MRI.getType(Reg);		LLT StoreVT = MRI.getType(Reg);
assert(StoreVT.isVector() && StoreVT.getElementType() == S16);		assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

		if (ST.hasUnpackedD16VMem()) {
auto Unmerge = B.buildUnmerge(S16, Reg);		auto Unmerge = B.buildUnmerge(S16, Reg);

SmallVector<Register, 4> WideRegs;		SmallVector<Register, 4> WideRegs;
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)		for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));		WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

int NumElts = StoreVT.getNumElements();		int NumElts = StoreVT.getNumElements();

return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);		return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}		}

		if (ImageStore && ST.hasImageStoreD16Bug()) {
		if (StoreVT.getNumElements() == 2) {
		arsenmUnsubmitted Done Reply Inline Actions Brace formatting arsenm: Brace formatting
		SmallVector<Register, 4> PackedRegs;
		Reg = B.buildBitcast(S32, Reg).getReg(0);
		PackedRegs.push_back(Reg);
		PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
		return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
		}
		arsenmUnsubmitted Not Done Reply Inline Actions There's no obstacle to handling v3 here, it should work in the other cases arsenm: There's no obstacle to handling v3 here, it should work in the other cases
		rdominguAuthorUnsubmitted Done Reply Inline Actions Note that v3f16 is not handled in SIISelLowering.cpp either (line 7255). Are you suggesting we implement v3f16 there too? rdomingu: Note that v3f16 is not handled in SIISelLowering.cpp either (line 7255). Are you suggesting we…
		arsenmUnsubmitted Not Done Reply Inline Actions There is a v3f16 definition now, so you could. I think it's more important for globalisel to be more complete arsenm: There is a v3f16 definition now, so you could. I think it's more important for globalisel to be…

		if (StoreVT.getNumElements() == 3) {
		SmallVector<Register, 4> PackedRegs;
		arsenmUnsubmitted Not Done Reply Inline Actions If you just resize to 8 below, why not use an array? arsenm: If you just resize to 8 below, why not use an array?
		auto Unmerge = B.buildUnmerge(S16, Reg);
		for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
		PackedRegs.push_back(Unmerge.getReg(I));
		PackedRegs.resize(8, B.buildUndef(S16).getReg(0));
		arsenmUnsubmitted Not Done Reply Inline Actions Resize here is weird. You can push back, or construct PackedRegs to the desired size? arsenm: Resize here is weird. You can push back, or construct PackedRegs to the desired size?
		arsenmUnsubmitted Not Done Reply Inline Actions You can do this in the initial construction, also the small size should just be 8? Or use std::array? arsenm: You can do this in the initial construction, also the small size should just be 8? Or use std…
		Reg = B.buildBuildVector(LLT::vector(8, S16), PackedRegs).getReg(0);
		arsenmUnsubmitted Not Done Reply Inline Actions It would be preferable to emit a concat_vectors of <2 x s16> pieces here arsenm: It would be preferable to emit a concat_vectors of <2 x s16> pieces here
		rdominguAuthorUnsubmitted Done Reply Inline Actions Sorry, I'm new to this. Why would concat_vectors be preferable to build_vector? Could you please elaborate? rdomingu: Sorry, I'm new to this. Why would concat_vectors be preferable to build_vector? Could you…
		arsenmUnsubmitted Not Done Reply Inline Actions Because a G_BUILD_VECTOR with 16-bit sources isn't naturally legal. This works, it just adds more work for the legalizer to reprocess these when you could produce something that's legal to begin with to save compile time arsenm: Because a G_BUILD_VECTOR with 16-bit sources isn't naturally legal. This works, it just adds…
		rdominguAuthorUnsubmitted Done Reply Inline Actions I see. But how would you go from v3f16 to concat_vectors of <2 x s16> to v4f32 (which is what we want at the end)? rdomingu: I see. But how would you go from v3f16 to concat_vectors of <2 x s16> to v4f32 (which is what we…
		arsenmUnsubmitted Not Done Reply Inline Actions I think I'm missing something. Why is this going from <3 x s16> to <4 x s32>? Isn't this the unpacked layout case? Why isn't this just a G_ANYEXT from <3 x s16> to <3 x s32>? arsenm: I think I'm missing something. Why is this going from <3 x s16> to <4 x s32>? Isn't this the…
		rdominguAuthorUnsubmitted Done Reply Inline Actions This is the image workaround for the packed layout case. We don't want to change the data layout which is why we shouldn't use G_ANYEXT. We just want to make the compiler think the data is twice as big. rdomingu: This is the image workaround for the packed layout case. We don't want to change the data…
		return B.buildBitcast(LLT::vector(4, S32), Reg).getReg(0);
		}

		if (StoreVT.getNumElements() == 4) {
		SmallVector<Register, 4> PackedRegs;
		arsenmUnsubmitted Not Done Reply Inline Actions Same as above arsenm: Same as above
		Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
		auto Unmerge = B.buildUnmerge(S32, Reg);
		for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
		PackedRegs.push_back(Unmerge.getReg(I));
		PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
		return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
		}

		llvm_unreachable("invalid data type");
		}

		return Reg;
		}

Register AMDGPULegalizerInfo::fixStoreSourceType(		Register AMDGPULegalizerInfo::fixStoreSourceType(
MachineIRBuilder &B, Register VData, bool IsFormat) const {		MachineIRBuilder &B, Register VData, bool IsFormat) const {
MachineRegisterInfo *MRI = B.getMRI();		MachineRegisterInfo *MRI = B.getMRI();
LLT Ty = MRI->getType(VData);		LLT Ty = MRI->getType(VData);

		arsenmUnsubmitted Done Reply Inline Actions No else after return arsenm: No else after return
const LLT S16 = LLT::scalar(16);		const LLT S16 = LLT::scalar(16);

// Fixup illegal register types for i8 stores.		// Fixup illegal register types for i8 stores.
if (Ty == LLT::scalar(8) \|\| Ty == S16) {		if (Ty == LLT::scalar(8) \|\| Ty == S16) {
Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);		Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
return AnyExt;		return AnyExt;
}		}

if (Ty.isVector()) {		if (Ty.isVector()) {
if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {		if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
if (IsFormat)		if (IsFormat)
return handleD16VData(B, *MRI, VData);		return handleD16VData(B, *MRI, VData);
}		}
}		}

		arsenmUnsubmitted Done Reply Inline Actions No else after return arsenm: No else after return
return VData;		return VData;
}		}
		arsenmUnsubmitted Done Reply Inline Actions llvm_unreachable instead of assert(false) arsenm: llvm_unreachable instead of assert(false)

bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,		bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
MachineRegisterInfo &MRI,		MachineRegisterInfo &MRI,
MachineIRBuilder &B,		MachineIRBuilder &B,
bool IsTyped,		bool IsTyped,
bool IsFormat) const {		bool IsFormat) const {
Register VData = MI.getOperand(1).getReg();		Register VData = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(VData);		LLT Ty = MRI.getType(VData);
▲ Show 20 Lines • Show All 629 Lines • ▼ Show 20 Lines	bool AMDGPULegalizerInfo::legalizeImageIntrinsic(

if (BaseOpcode->Store) { // No TFE for stores?		if (BaseOpcode->Store) { // No TFE for stores?
// TODO: Handle dmask trim		// TODO: Handle dmask trim
Register VData = MI.getOperand(1).getReg();		Register VData = MI.getOperand(1).getReg();
LLT Ty = MRI->getType(VData);		LLT Ty = MRI->getType(VData);
if (!Ty.isVector() \|\| Ty.getElementType() != S16)		if (!Ty.isVector() \|\| Ty.getElementType() != S16)
return true;		return true;

Register RepackedReg = handleD16VData(B, *MRI, VData);		Register RepackedReg = handleD16VData(B, *MRI, VData, true);
if (RepackedReg != VData) {		if (RepackedReg != VData) {
MI.getOperand(1).setReg(RepackedReg);		MI.getOperand(1).setReg(RepackedReg);
}		}

return true;		return true;
}		}

Register DstReg = MI.getOperand(0).getReg();		Register DstReg = MI.getOperand(0).getReg();
▲ Show 20 Lines • Show All 552 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Show First 20 Lines • Show All 405 Lines • ▼ Show 20 Lines	protected:
bool HasVMEMtoScalarWriteHazard;		bool HasVMEMtoScalarWriteHazard;
bool HasSMEMtoVectorWriteHazard;		bool HasSMEMtoVectorWriteHazard;
bool HasInstFwdPrefetchBug;		bool HasInstFwdPrefetchBug;
bool HasVcmpxExecWARHazard;		bool HasVcmpxExecWARHazard;
bool HasLdsBranchVmemWARHazard;		bool HasLdsBranchVmemWARHazard;
bool HasNSAtoVMEMBug;		bool HasNSAtoVMEMBug;
bool HasOffset3fBug;		bool HasOffset3fBug;
bool HasFlatSegmentOffsetBug;		bool HasFlatSegmentOffsetBug;
		bool HasImageStoreD16Bug;
		bool HasImageGather4D16Bug;

// Dummy feature to use for assembler in tablegen.		// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;		bool FeatureDisable;

SelectionDAGTargetInfo TSInfo;		SelectionDAGTargetInfo TSInfo;
private:		private:
SIInstrInfo InstrInfo;		SIInstrInfo InstrInfo;
SITargetLowering TLInfo;		SITargetLowering TLInfo;
▲ Show 20 Lines • Show All 598 Lines • ▼ Show 20 Lines	public:
bool hasA16() const { return hasR128A16() \|\| hasGFX10A16(); }		bool hasA16() const { return hasR128A16() \|\| hasGFX10A16(); }

bool hasG16() const { return HasG16; }		bool hasG16() const { return HasG16; }

bool hasOffset3fBug() const {		bool hasOffset3fBug() const {
return HasOffset3fBug;		return HasOffset3fBug;
}		}

bool hasNSAEncoding() const {		bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
return HasNSAEncoding;
}		bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }

		bool hasNSAEncoding() const { return HasNSAEncoding; }

bool hasGFX10_BEncoding() const {		bool hasGFX10_BEncoding() const {
return GFX10_BEncoding;		return GFX10_BEncoding;
}		}

bool hasGFX10_3Insts() const {		bool hasGFX10_3Insts() const {
return GFX10_3Insts;		return GFX10_3Insts;
}		}
▲ Show 20 Lines • Show All 373 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Show First 20 Lines • Show All 265 Lines • ▼ Show 20 Lines	GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasVMEMtoScalarWriteHazard(false),		HasVMEMtoScalarWriteHazard(false),
HasSMEMtoVectorWriteHazard(false),		HasSMEMtoVectorWriteHazard(false),
HasInstFwdPrefetchBug(false),		HasInstFwdPrefetchBug(false),
HasVcmpxExecWARHazard(false),		HasVcmpxExecWARHazard(false),
HasLdsBranchVmemWARHazard(false),		HasLdsBranchVmemWARHazard(false),
HasNSAtoVMEMBug(false),		HasNSAtoVMEMBug(false),
HasOffset3fBug(false),		HasOffset3fBug(false),
HasFlatSegmentOffsetBug(false),		HasFlatSegmentOffsetBug(false),
		HasImageStoreD16Bug(false),
		HasImageGather4D16Bug(false),

FeatureDisable(false),		FeatureDisable(false),
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),		InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),		TLInfo(TM, *this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {		FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);		MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));		CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));		InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
Show All 13 Lines	unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
case AMDGPU::V_LSHL_B64:		case AMDGPU::V_LSHL_B64:
case AMDGPU::V_LSHRREV_B64:		case AMDGPU::V_LSHRREV_B64:
case AMDGPU::V_LSHRREV_B64_gfx10:		case AMDGPU::V_LSHRREV_B64_gfx10:
case AMDGPU::V_LSHR_B64:		case AMDGPU::V_LSHR_B64:
case AMDGPU::V_ASHRREV_I64:		case AMDGPU::V_ASHRREV_I64:
case AMDGPU::V_ASHRREV_I64_gfx10:		case AMDGPU::V_ASHRREV_I64_gfx10:
case AMDGPU::V_ASHR_I64:		case AMDGPU::V_ASHR_I64:
return 1;		return 1;
}		}
		arsenmUnsubmitted Done Reply Inline Actions I know clang-format really wants to pack these onto a single line, but it's a terrible idea and you shouldn't listen to it arsenm: I know clang-format really wants to pack these onto a single line, but it's a terrible idea and…

return 2;		return 2;
}		}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,		unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
const Function &F) const {		const Function &F) const {
if (NWaves == 1)		if (NWaves == 1)
return getLocalMemorySize();		return getLocalMemorySize();
▲ Show 20 Lines • Show All 614 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.h

Show First 20 Lines • Show All 102 Lines • ▼ Show 20 Lines	SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
ArrayRef<SDValue> Ops) const;		ArrayRef<SDValue> Ops) const;

// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to		// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.		// dwordx4 if on SI.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,		SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
ArrayRef<SDValue> Ops, EVT MemVT,		ArrayRef<SDValue> Ops, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) const;		MachineMemOperand *MMO, SelectionDAG &DAG) const;

SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;		SDValue handleD16VData(SDValue VData, SelectionDAG &DAG,
		bool ImageStore = false) const;

/// Converts \p Op, which must be of floating point type, to the		/// Converts \p Op, which must be of floating point type, to the
/// floating point type \p VT, by either extending or truncating it.		/// floating point type \p VT, by either extending or truncating it.
SDValue getFPExtOrFPRound(SelectionDAG &DAG,		SDValue getFPExtOrFPRound(SelectionDAG &DAG,
SDValue Op,		SDValue Op,
const SDLoc &DL,		const SDLoc &DL,
EVT VT) const;		EVT VT) const;

▲ Show 20 Lines • Show All 367 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,845 Lines • ▼ Show 20 Lines	MVT DataDwordVT = NumDataDwords == 1 ?
MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);		MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);

MVT MaskPopVT = MaskPopDwords == 1 ?		MVT MaskPopVT = MaskPopDwords == 1 ?
MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);		MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);

SDValue Data(Result, 0);		SDValue Data(Result, 0);
SDValue TexFail;		SDValue TexFail;

if (IsTexFail) {		if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);		SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
if (MaskPopVT.isVector()) {		if (MaskPopVT.isVector()) {
Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,		Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
SDValue(Result, 0), ZeroIdx);		SDValue(Result, 0), ZeroIdx);
} else {		} else {
Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,		Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
SDValue(Result, 0), ZeroIdx);		SDValue(Result, 0), ZeroIdx);
}		}

TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
SDValue(Result, 0),
DAG.getConstant(MaskPopDwords, DL, MVT::i32));
}		}

if (DataDwordVT.isVector())		if (DataDwordVT.isVector())
Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,		Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
NumDataDwords - MaskPopDwords);		NumDataDwords - MaskPopDwords);

if (IsD16)		if (IsD16)
Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);		Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);

EVT LegalReqRetVT = ReqRetVT;		EVT LegalReqRetVT = ReqRetVT;
if (!ReqRetVT.isVector()) {		if (!ReqRetVT.isVector()) {
Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);		Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
} else {		} else {
// We need to widen the return vector to a legal type		// We need to widen the return vector to a legal type
if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&		if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
ReqRetVT.getVectorElementType().getSizeInBits() == 16) {		ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
LegalReqRetVT =		LegalReqRetVT =
EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),		EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
ReqRetVT.getVectorNumElements() + 1);		ReqRetVT.getVectorNumElements() + 1);
}		}
}		}
Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);		Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);

if (TexFail)		if (IsTexFail) {
		TexFail =
		DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
		DAG.getConstant(MaskPopDwords, DL, MVT::i32));

return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);		return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
		}

if (Result->getNumValues() == 1)		if (Result->getNumValues() == 1)
return Data;		return Data;

return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);		return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
}		}

static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,		static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines	if (BaseOpcode->Store) {
VData = Op.getOperand(2);		VData = Op.getOperand(2);

MVT StoreVT = VData.getSimpleValueType();		MVT StoreVT = VData.getSimpleValueType();
if (StoreVT.getScalarType() == MVT::f16) {		if (StoreVT.getScalarType() == MVT::f16) {
if (!Subtarget->hasD16Images() \|\| !BaseOpcode->HasD16)		if (!Subtarget->hasD16Images() \|\| !BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction		return Op; // D16 is unsupported for this instruction

IsD16 = true;		IsD16 = true;
VData = handleD16VData(VData, DAG);		VData = handleD16VData(VData, DAG, true);
}		}

NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;		NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
} else {		} else {
// Work out the num dwords based on the dmask popcount and underlying type		// Work out the num dwords based on the dmask popcount and underlying type
// and whether packing is supported.		// and whether packing is supported.
MVT LoadVT = ResultTypes[0].getSimpleVT();		MVT LoadVT = ResultTypes[0].getSimpleVT();
if (LoadVT.getScalarType() == MVT::f16) {		if (LoadVT.getScalarType() == MVT::f16) {
if (!Subtarget->hasD16Images() \|\| !BaseOpcode->HasD16)		if (!Subtarget->hasD16Images() \|\| !BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction		return Op; // D16 is unsupported for this instruction

IsD16 = true;		IsD16 = true;
}		}

// Confirm that the return type is large enough for the dmask specified		// Confirm that the return type is large enough for the dmask specified
if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) \|\|		if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) \|\|
(!LoadVT.isVector() && DMaskLanes > 1))		(!LoadVT.isVector() && DMaskLanes > 1))
return Op;		return Op;

if (IsD16 && !Subtarget->hasUnpackedD16VMem())		// The sq block of gfx8 and gfx9 do not estimate register use correctly
		// for d16 image_gather4, image_gather4_l, and image_gather4_lz
		// instructions.
		if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
		!(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
		arsenmUnsubmitted Not Done Reply Inline Actions IsD16 && (!hasUnpackedD16() \|\| (Gather4 && hasGatherbug))? arsenm: IsD16 && (!hasUnpackedD16() \|\| (Gather4 && hasGatherbug))?
		rdominguAuthorUnsubmitted Done Reply Inline Actions We want NumVDataDwords = DMaskLanes for Gather4 && IsD16 && hasGatherBug. That wouldn't be the case with your suggestion. rdomingu: We want NumVDataDwords = DMaskLanes for Gather4 && IsD16 && hasGatherBug. That wouldn't be the…
NumVDataDwords = (DMaskLanes + 1) / 2;		NumVDataDwords = (DMaskLanes + 1) / 2;
else		else
NumVDataDwords = DMaskLanes;		NumVDataDwords = DMaskLanes;

AdjustRetType = true;		AdjustRetType = true;
}		}
}		}

▲ Show 20 Lines • Show All 1,357 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
if (WidenedVT != VT) {		if (WidenedVT != VT) {
auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,		auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
DAG.getVectorIdxConstant(0, DL));		DAG.getVectorIdxConstant(0, DL));
NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);		NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
}		}
return NewOp;		return NewOp;
}		}

SDValue SITargetLowering::handleD16VData(SDValue VData,		SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
SelectionDAG &DAG) const {		bool ImageStore) const {
EVT StoreVT = VData.getValueType();		EVT StoreVT = VData.getValueType();

// No change for f16 and legal vector D16 types.		// No change for f16 and legal vector D16 types.
if (!StoreVT.isVector())		if (!StoreVT.isVector())
return VData;		return VData;

SDLoc DL(VData);		SDLoc DL(VData);
unsigned NumElements = StoreVT.getVectorNumElements();		unsigned NumElements = StoreVT.getVectorNumElements();
Show All 15 Lines	if (Subtarget->hasUnpackedD16VMem()) {
EVT WidenedStoreVT = EVT::getVectorVT(		EVT WidenedStoreVT = EVT::getVectorVT(
*DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);		*DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),		EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
WidenedStoreVT.getStoreSizeInBits());		WidenedStoreVT.getStoreSizeInBits());
SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);		SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);		return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
}		}

		// The sq block of gfx8.1 does not estimate register use correctly for d16
		// image store instructions. The data operand is computed as if it were not a
		// d16 image instruction.
		if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
		// Bitcast to i16
		EVT IntStoreVT = StoreVT.changeTypeToInteger();
		SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

		// Decompose into scalars
		SmallVector<SDValue, 4> Elts;
		DAG.ExtractVectorElements(IntVData, Elts);

		// Group pairs of i16 into v2i16 and bitcast to i32
		SmallVector<SDValue, 4> PackedElts;
		arsenm (not done): You can initialize this to the target size and then avoid push_back.
		for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
		arsenm (done): This is the same as just MVT::v2i16.
		arsenm (not done): Use ++I instead of I += 1.
		SDValue Pair =
		DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
		arsenm (done): Formatting.
		SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
		PackedElts.push_back(IntPair);
		}

		// Pad using UNDEF
		PackedElts.resize(PackedElts.size() * 2, DAG.getUNDEF(MVT::i32));

		// Build final vector
		EVT VecVT =
		EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
		return DAG.getBuildVector(VecVT, DL, PackedElts);
		}

assert(isTypeLegal(StoreVT));		assert(isTypeLegal(StoreVT));
return VData;		return VData;
}		}

SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,		SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc DL(Op);		SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);		SDValue Chain = Op.getOperand(0);
▲ Show 20 Lines • Show All 4,451 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll

; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -global-isel-abort=0 -o - %s \| FileCheck -check-prefix=UNPACKED %s		; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -global-isel-abort=0 -o - %s \| FileCheck -check-prefix=UNPACKED %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s \| FileCheck -check-prefix=PACKED %s		; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s \| FileCheck -check-prefix=GFX81 %s

define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {		define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
		; PACKED-LABEL: name: image_store_f16
		; PACKED: bb.1 (%ir-block.0):
		; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
		; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
		; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
		; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
		; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
		; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
		; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
		; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
		; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
		; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
		; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
		; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
		; PACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
		; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
		; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
		; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
		; PACKED: S_ENDPGM 0
; UNPACKED-LABEL: name: image_store_f16		; UNPACKED-LABEL: name: image_store_f16
; UNPACKED: bb.1 (%ir-block.0):		; UNPACKED: bb.1 (%ir-block.0):
; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2		; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2		; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3		; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4		; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5		; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6		; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7		; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8		; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9		; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0		; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1		; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2		; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)		; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)		; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)		; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")		; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
; UNPACKED: S_ENDPGM 0		; UNPACKED: S_ENDPGM 0
; PACKED-LABEL: name: image_store_f16		; GFX81-LABEL: name: image_store_f16
		; GFX81: bb.1 (%ir-block.0):
		; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
		; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
		; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
		; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
		; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
		; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
		; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
		; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
		; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
		; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
		; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
		; GFX81: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
		; GFX81: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
		; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
		; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
		; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
		; GFX81: S_ENDPGM 0
		call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
		ret void
		}

		define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x half> %in) {
		; PACKED-LABEL: name: image_store_v2f16
; PACKED: bb.1 (%ir-block.0):		; PACKED: bb.1 (%ir-block.0):
; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2		; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2		; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3		; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4		; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5		; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6		; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7		; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8		; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9		; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0		; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1		; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2		; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; PACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)		; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)		; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")		; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
; PACKED: S_ENDPGM 0		; PACKED: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}

define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x half> %in) {
; UNPACKED-LABEL: name: image_store_v2f16		; UNPACKED-LABEL: name: image_store_v2f16
; UNPACKED: bb.1 (%ir-block.0):		; UNPACKED: bb.1 (%ir-block.0):
; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2		; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2		; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3		; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4		; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5		; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6		; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7		; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8		; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9		; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0		; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1		; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; UNPACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2		; UNPACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)		; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)		; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)		; UNPACKED: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16		; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)		; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)		; UNPACKED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)		; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY11]](s32), [[COPY12]](s32)		; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY11]](s32), [[COPY12]](s32)
; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")		; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
; UNPACKED: S_ENDPGM 0		; UNPACKED: S_ENDPGM 0
; PACKED-LABEL: name: image_store_v2f16		; GFX81-LABEL: name: image_store_v2f16
		; GFX81: bb.1 (%ir-block.0):
		; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
		; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
		; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
		; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
		; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
		; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
		; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
		; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
		; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
		; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
		; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
		; GFX81: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
		; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
		; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
		; GFX81: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
		; GFX81: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
		; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[DEF]](s32)
		; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
		; GFX81: S_ENDPGM 0
		call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
		ret void
		}

		define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
		; PACKED-LABEL: name: image_store_v3f16
; PACKED: bb.1 (%ir-block.0):		; PACKED: bb.1 (%ir-block.0):
; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2		; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2		; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3		; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4		; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5		; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6		; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7		; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8		; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9		; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0		; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1		; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2		; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
		; PACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)		; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
		; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
		; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
		; PACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
		; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
		; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
		; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
		; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
		; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
		; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
		; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]]
		; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
		; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]]
		; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
		; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
		; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
		; PACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
		; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]]
		; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
		; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
		; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
		; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
		; PACKED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[DEF]](<2 x s16>)
		; PACKED: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)		; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")		; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; PACKED: S_ENDPGM 0		; PACKED: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}

define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
; UNPACKED-LABEL: name: image_store_v3f16		; UNPACKED-LABEL: name: image_store_v3f16
; UNPACKED: bb.1 (%ir-block.0):		; UNPACKED: bb.1 (%ir-block.0):
; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3		; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2		; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3		; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4		; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5		; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6		; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
Show All 14 Lines	define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)		; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)		; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)		; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)		; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)		; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)		; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)		; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; UNPACKED: S_ENDPGM 0		; UNPACKED: S_ENDPGM 0
; PACKED-LABEL: name: image_store_v3f16		; GFX81-LABEL: name: image_store_v3f16
		; GFX81: bb.1 (%ir-block.0):
		; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
		; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
		; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
		; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
		; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
		; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
		; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
		; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
		; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
		; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
		; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
		; GFX81: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
		; GFX81: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
		; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
		; GFX81: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
		; GFX81: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
		; GFX81: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
		; GFX81: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
		; GFX81: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
		; GFX81: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
		; GFX81: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
		; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
		; GFX81: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
		; GFX81: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
		; GFX81: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]]
		; GFX81: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
		; GFX81: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]]
		; GFX81: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
		; GFX81: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
		; GFX81: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
		; GFX81: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
		; GFX81: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]]
		; GFX81: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
		; GFX81: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
		; GFX81: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
		; GFX81: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
		; GFX81: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]]
		; GFX81: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
		; GFX81: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
		; GFX81: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
		; GFX81: [[CONCAT_VECTORS1:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
		; GFX81: [[BITCAST5:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<8 x s16>)
		; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST5]](<4 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
		; GFX81: S_ENDPGM 0
		call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
		ret void
		}

		define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) {
		; PACKED-LABEL: name: image_store_v4f16
; PACKED: bb.1 (%ir-block.0):		; PACKED: bb.1 (%ir-block.0):
; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3		; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2		; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3		; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4		; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5		; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6		; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7		; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8		; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9		; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0		; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1		; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2		; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; PACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3		; PACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)		; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF		; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
; PACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
; PACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; PACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
; PACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
; PACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; PACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
; PACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]]
; PACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; PACKED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]]
; PACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
; PACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; PACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; PACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
; PACKED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]]
; PACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; PACKED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
; PACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
; PACKED: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; PACKED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[DEF]](<2 x s16>)
; PACKED: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)		; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)		; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
; PACKED: S_ENDPGM 0		; PACKED: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}

define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) {
; UNPACKED-LABEL: name: image_store_v4f16		; UNPACKED-LABEL: name: image_store_v4f16
; UNPACKED: bb.1 (%ir-block.0):		; UNPACKED: bb.1 (%ir-block.0):
; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3		; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2		; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3		; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4		; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5		; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6		; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
Show All 13 Lines	define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) {
; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)		; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)		; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)		; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)		; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
; UNPACKED: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)		; UNPACKED: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)		; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)
; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")		; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
; UNPACKED: S_ENDPGM 0		; UNPACKED: S_ENDPGM 0
; PACKED-LABEL: name: image_store_v4f16		; GFX81-LABEL: name: image_store_v4f16
; PACKED: bb.1 (%ir-block.0):		; GFX81: bb.1 (%ir-block.0):
; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3		; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2		; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3		; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4		; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5		; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6		; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7		; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8		; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9		; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0		; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1		; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2		; GFX81: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; PACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3		; GFX81: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)		; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)		; GFX81: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)		; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")		; GFX81: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
; PACKED: S_ENDPGM 0		; GFX81: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
		; GFX81: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
		; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[DEF]](s32), [[DEF]](s32)
		; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
		; GFX81: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)		call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void		ret void
}		}

declare void @llvm.amdgcn.image.store.2d.f16.i32(half, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0		declare void @llvm.amdgcn.image.store.2d.f16.i32(half, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0		declare void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0		declare void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0		declare void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0

attributes #0 = { nounwind writeonly }		attributes #0 = { nounwind writeonly }

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -o - %s | FileCheck -check-prefix=UNPACKED %s			; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -o - %s | FileCheck -check-prefix=UNPACKED %s
	; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=PACKED %s			; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=GFX81 %s

	define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {			define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
	; UNPACKED-LABEL: image_store_f16:			; UNPACKED-LABEL: image_store_f16:
	; UNPACKED: ; %bb.0:			; UNPACKED: ; %bb.0:
	; UNPACKED-NEXT: s_mov_b32 s0, s2			; UNPACKED-NEXT: s_mov_b32 s0, s2
	; UNPACKED-NEXT: s_mov_b32 s1, s3			; UNPACKED-NEXT: s_mov_b32 s1, s3
	; UNPACKED-NEXT: s_mov_b32 s2, s4			; UNPACKED-NEXT: s_mov_b32 s2, s4
	; UNPACKED-NEXT: s_mov_b32 s3, s5			; UNPACKED-NEXT: s_mov_b32 s3, s5
	; UNPACKED-NEXT: s_mov_b32 s4, s6			; UNPACKED-NEXT: s_mov_b32 s4, s6
	; UNPACKED-NEXT: s_mov_b32 s5, s7			; UNPACKED-NEXT: s_mov_b32 s5, s7
	; UNPACKED-NEXT: s_mov_b32 s6, s8			; UNPACKED-NEXT: s_mov_b32 s6, s8
	; UNPACKED-NEXT: s_mov_b32 s7, s9			; UNPACKED-NEXT: s_mov_b32 s7, s9
	; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm			; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16
	; UNPACKED-NEXT: s_endpgm			; UNPACKED-NEXT: s_endpgm
	;			;
	; PACKED-LABEL: image_store_f16:			; PACKED-LABEL: image_store_f16:
	; PACKED: ; %bb.0:			; PACKED: ; %bb.0:
	; PACKED-NEXT: s_mov_b32 s0, s2			; PACKED-NEXT: s_mov_b32 s0, s2
	; PACKED-NEXT: s_mov_b32 s1, s3			; PACKED-NEXT: s_mov_b32 s1, s3
	; PACKED-NEXT: s_mov_b32 s2, s4			; PACKED-NEXT: s_mov_b32 s2, s4
	; PACKED-NEXT: s_mov_b32 s3, s5			; PACKED-NEXT: s_mov_b32 s3, s5
	; PACKED-NEXT: s_mov_b32 s4, s6			; PACKED-NEXT: s_mov_b32 s4, s6
	; PACKED-NEXT: s_mov_b32 s5, s7			; PACKED-NEXT: s_mov_b32 s5, s7
	; PACKED-NEXT: s_mov_b32 s6, s8			; PACKED-NEXT: s_mov_b32 s6, s8
	; PACKED-NEXT: s_mov_b32 s7, s9			; PACKED-NEXT: s_mov_b32 s7, s9
	; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm			; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16
	; PACKED-NEXT: s_endpgm			; PACKED-NEXT: s_endpgm
				;
				; GFX81-LABEL: image_store_f16:
				; GFX81: ; %bb.0:
				; GFX81-NEXT: s_mov_b32 s0, s2
				; GFX81-NEXT: s_mov_b32 s1, s3
				; GFX81-NEXT: s_mov_b32 s2, s4
				; GFX81-NEXT: s_mov_b32 s3, s5
				; GFX81-NEXT: s_mov_b32 s4, s6
				; GFX81-NEXT: s_mov_b32 s5, s7
				; GFX81-NEXT: s_mov_b32 s6, s8
				; GFX81-NEXT: s_mov_b32 s7, s9
				; GFX81-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16
				; GFX81-NEXT: s_endpgm
	call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)			call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
	ret void			ret void
	}			}

	define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x half> %in) {			define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x half> %in) {
	; UNPACKED-LABEL: image_store_v2f16:			; UNPACKED-LABEL: image_store_v2f16:
	; UNPACKED: ; %bb.0:			; UNPACKED: ; %bb.0:
	; UNPACKED-NEXT: s_mov_b32 s0, s2			; UNPACKED-NEXT: s_mov_b32 s0, s2
	; UNPACKED-NEXT: s_mov_b32 s1, s3			; UNPACKED-NEXT: s_mov_b32 s1, s3
	; UNPACKED-NEXT: s_mov_b32 s2, s4			; UNPACKED-NEXT: s_mov_b32 s2, s4
	; UNPACKED-NEXT: s_mov_b32 s3, s5			; UNPACKED-NEXT: s_mov_b32 s3, s5
	; UNPACKED-NEXT: s_mov_b32 s4, s6			; UNPACKED-NEXT: s_mov_b32 s4, s6
	; UNPACKED-NEXT: s_mov_b32 s5, s7			; UNPACKED-NEXT: s_mov_b32 s5, s7
	; UNPACKED-NEXT: s_mov_b32 s6, s8			; UNPACKED-NEXT: s_mov_b32 s6, s8
	; UNPACKED-NEXT: s_mov_b32 s7, s9			; UNPACKED-NEXT: s_mov_b32 s7, s9
	; UNPACKED-NEXT: v_lshrrev_b32_e32 v3, 16, v2			; UNPACKED-NEXT: v_lshrrev_b32_e32 v3, 16, v2
	; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm			; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16
	; UNPACKED-NEXT: s_endpgm			; UNPACKED-NEXT: s_endpgm
	;			;
	; PACKED-LABEL: image_store_v2f16:			; PACKED-LABEL: image_store_v2f16:
	; PACKED: ; %bb.0:			; PACKED: ; %bb.0:
	; PACKED-NEXT: s_mov_b32 s0, s2			; PACKED-NEXT: s_mov_b32 s0, s2
	; PACKED-NEXT: s_mov_b32 s1, s3			; PACKED-NEXT: s_mov_b32 s1, s3
	; PACKED-NEXT: s_mov_b32 s2, s4			; PACKED-NEXT: s_mov_b32 s2, s4
	; PACKED-NEXT: s_mov_b32 s3, s5			; PACKED-NEXT: s_mov_b32 s3, s5
	; PACKED-NEXT: s_mov_b32 s4, s6			; PACKED-NEXT: s_mov_b32 s4, s6
	; PACKED-NEXT: s_mov_b32 s5, s7			; PACKED-NEXT: s_mov_b32 s5, s7
	; PACKED-NEXT: s_mov_b32 s6, s8			; PACKED-NEXT: s_mov_b32 s6, s8
	; PACKED-NEXT: s_mov_b32 s7, s9			; PACKED-NEXT: s_mov_b32 s7, s9
	; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm			; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16
	; PACKED-NEXT: s_endpgm			; PACKED-NEXT: s_endpgm
				;
				; GFX81-LABEL: image_store_v2f16:
				; GFX81: ; %bb.0:
				; GFX81-NEXT: s_mov_b32 s0, s2
				; GFX81-NEXT: s_mov_b32 s1, s3
				; GFX81-NEXT: s_mov_b32 s2, s4
				; GFX81-NEXT: s_mov_b32 s3, s5
				; GFX81-NEXT: s_mov_b32 s4, s6
				; GFX81-NEXT: s_mov_b32 s5, s7
				; GFX81-NEXT: s_mov_b32 s6, s8
				; GFX81-NEXT: s_mov_b32 s7, s9
				; GFX81-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16
				; GFX81-NEXT: s_endpgm
	call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)			call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
	ret void			ret void
	}			}

	; FIXME: Broken			; FIXME: Broken
	; define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {			; define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
	; call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)			; call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
	; ret void			; ret void
	Show All 10 Lines
	; UNPACKED-NEXT: s_mov_b32 s3, s5			; UNPACKED-NEXT: s_mov_b32 s3, s5
	; UNPACKED-NEXT: s_mov_b32 s4, s6			; UNPACKED-NEXT: s_mov_b32 s4, s6
	; UNPACKED-NEXT: s_mov_b32 s5, s7			; UNPACKED-NEXT: s_mov_b32 s5, s7
	; UNPACKED-NEXT: s_mov_b32 s6, s8			; UNPACKED-NEXT: s_mov_b32 s6, s8
	; UNPACKED-NEXT: s_mov_b32 s7, s9			; UNPACKED-NEXT: s_mov_b32 s7, s9
	; UNPACKED-NEXT: v_mov_b32_e32 v5, v0			; UNPACKED-NEXT: v_mov_b32_e32 v5, v0
	; UNPACKED-NEXT: v_lshrrev_b32_e32 v2, 16, v1			; UNPACKED-NEXT: v_lshrrev_b32_e32 v2, 16, v1
	; UNPACKED-NEXT: v_lshrrev_b32_e32 v4, 16, v3			; UNPACKED-NEXT: v_lshrrev_b32_e32 v4, 16, v3
	; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm			; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm d16
	; UNPACKED-NEXT: s_endpgm			; UNPACKED-NEXT: s_endpgm
	;			;
	; PACKED-LABEL: image_store_v4f16:			; PACKED-LABEL: image_store_v4f16:
	; PACKED: ; %bb.0:			; PACKED: ; %bb.0:
	; PACKED-NEXT: s_mov_b32 s0, s2			; PACKED-NEXT: s_mov_b32 s0, s2
	; PACKED-NEXT: s_mov_b32 s1, s3			; PACKED-NEXT: s_mov_b32 s1, s3
	; PACKED-NEXT: s_mov_b32 s2, s4			; PACKED-NEXT: s_mov_b32 s2, s4
	; PACKED-NEXT: s_mov_b32 s3, s5			; PACKED-NEXT: s_mov_b32 s3, s5
	; PACKED-NEXT: s_mov_b32 s4, s6			; PACKED-NEXT: s_mov_b32 s4, s6
	; PACKED-NEXT: s_mov_b32 s5, s7			; PACKED-NEXT: s_mov_b32 s5, s7
	; PACKED-NEXT: s_mov_b32 s6, s8			; PACKED-NEXT: s_mov_b32 s6, s8
	; PACKED-NEXT: s_mov_b32 s7, s9			; PACKED-NEXT: s_mov_b32 s7, s9
	; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm			; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16
	; PACKED-NEXT: s_endpgm			; PACKED-NEXT: s_endpgm
				;
				; GFX81-LABEL: image_store_v4f16:
				; GFX81: ; %bb.0:
				; GFX81-NEXT: s_mov_b32 s0, s2
				; GFX81-NEXT: s_mov_b32 s1, s3
				; GFX81-NEXT: s_mov_b32 s2, s4
				; GFX81-NEXT: s_mov_b32 s3, s5
				; GFX81-NEXT: s_mov_b32 s4, s6
				; GFX81-NEXT: s_mov_b32 s5, s7
				; GFX81-NEXT: s_mov_b32 s6, s8
				; GFX81-NEXT: s_mov_b32 s7, s9
				; GFX81-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16
				; GFX81-NEXT: s_endpgm
	call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)			call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
	ret void			ret void
	}			}

	declare void @llvm.amdgcn.image.store.2d.f16.i32(half, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0			declare void @llvm.amdgcn.image.store.2d.f16.i32(half, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
	declare void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0			declare void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
	declare void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0			declare void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
	declare void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0			declare void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0

	attributes #0 = { nounwind writeonly }			attributes #0 = { nounwind writeonly }

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll

; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s		; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s		; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s		; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s		; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s

; GCN-LABEL: {{^}}image_load_f16:		; GCN-LABEL: {{^}}image_load_f16:
; GFX89: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}}		; GFX89: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}}
; GFX10: image_load v0, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm d16{{$}}		; GFX10: image_load v0, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {		define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
main_body:		main_body:
%tex = call half @llvm.amdgcn.image.load.2d.f16.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)		%tex = call half @llvm.amdgcn.image.load.2d.f16.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret half %tex		ret half %tex
}		}

; GCN-LABEL: {{^}}image_load_v2f16:		; GCN-LABEL: {{^}}image_load_v2f16:
; UNPACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}		; UNPACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
; PACKED: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}		; PACKED: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
		; GFX81: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
; GFX10: image_load v0, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}}		; GFX10: image_load v0, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps float @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {		define amdgpu_ps float @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
main_body:		main_body:
%tex = call <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)		%tex = call <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
%r = bitcast <2 x half> %tex to float		%r = bitcast <2 x half> %tex to float
ret float %r		ret float %r
}		}

; GCN-LABEL: {{^}}image_load_v3f16:		; GCN-LABEL: {{^}}image_load_v3f16:
; UNPACKED: image_load v[0:2], v[0:1], s[0:7] dmask:0x7 unorm d16{{$}}		; UNPACKED: image_load v[0:2], v[0:1], s[0:7] dmask:0x7 unorm d16{{$}}
; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 unorm d16{{$}}		; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 unorm d16{{$}}
; GFX10: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm d16{{$}}		; GFX10: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps <2 x float> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {		define amdgpu_ps <2 x float> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
main_body:		main_body:
%tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)		%tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
%ext = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>		%ext = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%r = bitcast <4 x half> %ext to <2 x float>		%r = bitcast <4 x half> %ext to <2 x float>
ret <2 x float> %r		ret <2 x float> %r
}		}

; GCN-LABEL: {{^}}image_load_v4f16:		; GCN-LABEL: {{^}}image_load_v4f16:
; UNPACKED: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}		; UNPACKED: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}		; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
		; GFX81: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; GFX10: image_load v[0:1], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}		; GFX10: image_load v[0:1], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps <2 x float> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {		define amdgpu_ps <2 x float> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
main_body:		main_body:
%tex = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)		%tex = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
%r = bitcast <4 x half> %tex to <2 x float>		%r = bitcast <4 x half> %tex to <2 x float>
ret <2 x float> %r		ret <2 x float> %r
}		}

; GCN-LABEL: {{^}}image_load_mip_v4f16:		; GCN-LABEL: {{^}}image_load_mip_v4f16:
; UNPACKED: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm d16{{$}}		; UNPACKED: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm d16{{$}}
; PACKED: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf unorm d16{{$}}		; PACKED: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf unorm d16{{$}}
		; GFX81: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf unorm d16{{$}}
; GFX10: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}		; GFX10: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps <2 x float> @image_load_mip_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {		define amdgpu_ps <2 x float> @image_load_mip_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
main_body:		main_body:
%tex = call <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)		%tex = call <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
%r = bitcast <4 x half> %tex to <2 x float>		%r = bitcast <4 x half> %tex to <2 x float>
ret <2 x float> %r		ret <2 x float> %r
}		}

; GCN-LABEL: {{^}}image_load_3d_v2f16:		; GCN-LABEL: {{^}}image_load_3d_v2f16:
; UNPACKED: image_load v[0:1], v[0:2], s[0:7] dmask:0x3 unorm d16{{$}}		; UNPACKED: image_load v[0:1], v[0:2], s[0:7] dmask:0x3 unorm d16{{$}}
; PACKED: image_load v0, v[0:2], s[0:7] dmask:0x3 unorm d16{{$}}		; PACKED: image_load v0, v[0:2], s[0:7] dmask:0x3 unorm d16{{$}}
		; GFX81: image_load v0, v[0:2], s[0:7] dmask:0x3 unorm d16{{$}}
; GFX10: image_load v0, v[0:2], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm d16{{$}}		; GFX10: image_load v0, v[0:2], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm d16{{$}}
define amdgpu_ps float @image_load_3d_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {		define amdgpu_ps float @image_load_3d_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
main_body:		main_body:
%tex = call <2 x half> @llvm.amdgcn.image.load.3d.v2f16.i32(i32 3, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)		%tex = call <2 x half> @llvm.amdgcn.image.load.3d.v2f16.i32(i32 3, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
%x = bitcast <2 x half> %tex to float		%x = bitcast <2 x half> %tex to float
ret float %x		ret float %x
}		}

Show All 14 Lines	main_body:
ret void		ret void
}		}

; GCN-LABEL: {{^}}image_store_v2f16		; GCN-LABEL: {{^}}image_store_v2f16
; UNPACKED: v_lshrrev_b32_e32		; UNPACKED: v_lshrrev_b32_e32
; UNPACKED: v_and_b32_e32		; UNPACKED: v_and_b32_e32
; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}		; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
; PACKED: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}		; PACKED: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
		; GFX81: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}}		; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %in) {		define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %in) {
main_body:		main_body:
%data = bitcast float %in to <2 x half>		%data = bitcast float %in to <2 x half>
call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %data, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)		call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %data, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void		ret void
}		}

define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {		define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {
main_body:		main_body:
%r = bitcast <2 x float> %in to <4 x half>		%r = bitcast <2 x float> %in to <4 x half>
%data = shufflevector <4 x half> %r, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>		%data = shufflevector <4 x half> %r, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %data, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)		call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %data, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void		ret void
}		}

; GCN-LABEL: {{^}}image_store_v4f16		; GCN-LABEL: {{^}}image_store_v4f16
; UNPACKED: v_lshrrev_b32_e32		; UNPACKED: v_lshrrev_b32_e32
; UNPACKED: v_and_b32_e32		; UNPACKED: v_and_b32_e32
; UNPACKED: v_lshrrev_b32_e32		; UNPACKED: v_lshrrev_b32_e32
; UNPACKED: v_and_b32_e32		; UNPACKED: v_and_b32_e32
; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}		; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}		; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
		; GFX81: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}		; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {		define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {
main_body:		main_body:
%data = bitcast <2 x float> %in to <4 x half>		%data = bitcast <2 x float> %in to <4 x half>
call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %data, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)		call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %data, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void		ret void
}		}

; GCN-LABEL: {{^}}image_store_mip_1d_v4f16		; GCN-LABEL: {{^}}image_store_mip_1d_v4f16
; UNPACKED: v_lshrrev_b32_e32		; UNPACKED: v_lshrrev_b32_e32
; UNPACKED: v_and_b32_e32		; UNPACKED: v_and_b32_e32
; UNPACKED: v_lshrrev_b32_e32		; UNPACKED: v_lshrrev_b32_e32
; UNPACKED: v_and_b32_e32		; UNPACKED: v_and_b32_e32
; UNPACKED: image_store_mip v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}		; UNPACKED: image_store_mip v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; PACKED: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}		; PACKED: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
		; GFX81: image_store_mip v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; GFX10: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm d16{{$}}		; GFX10: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm d16{{$}}
define amdgpu_ps void @image_store_mip_1d_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %mip, <2 x float> %in) {		define amdgpu_ps void @image_store_mip_1d_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %mip, <2 x float> %in) {
main_body:		main_body:
%data = bitcast <2 x float> %in to <4 x half>		%data = bitcast <2 x float> %in to <4 x half>
call void @llvm.amdgcn.image.store.mip.1d.v4f16.i32(<4 x half> %data, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)		call void @llvm.amdgcn.image.store.mip.1d.v4f16.i32(<4 x half> %data, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
ret void		ret void
}		}

Show All 19 Lines

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll

	; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s			; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
	; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s			; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s
	; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s			; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
	; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s			; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s

	; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16:			; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16:
	; UNPACKED: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}			; UNPACKED: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
	; PACKED: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}			; PACKED: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
				; GFX81: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
				; GFX9: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
	; GFX10: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D d16{{$}}			; GFX10: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D d16{{$}}
	define amdgpu_ps <2 x float> @image_gather4_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {			define amdgpu_ps <2 x float> @image_gather4_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
	main_body:			main_body:
	%tex = call <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32 4, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)			%tex = call <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32 4, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
	%r = bitcast <4 x half> %tex to <2 x float>			%r = bitcast <4 x half> %tex to <2 x float>
	ret <2 x float> %r			ret <2 x float> %r
	}			}

	declare <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1			declare <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

	attributes #0 = { nounwind }			attributes #0 = { nounwind }
	attributes #1 = { nounwind readonly }			attributes #1 = { nounwind readonly }
	attributes #2 = { nounwind readnone }			attributes #2 = { nounwind readnone }

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Implement hardware bug workaround for image instructions
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 296645

llvm/lib/Target/AMDGPU/AMDGPU.td

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.h

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Implement hardware bug workaround for image instructions
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 296645

llvm/lib/Target/AMDGPU/AMDGPU.td

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.h

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll

[AMDGPU] Implement hardware bug workaround for image instructions
ClosedPublic