This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Fix interaction of tfe and d16
ClosedPublic

Authored by arsenm on Jan 17 2020, 3:53 PM.

Download Raw Diff

Details

Reviewers

rtaylor
nhaehnle
tpr
dstuttard

Summary

This was using the wrong result register and dropping the result entirely
for v2f16. It would also fail to select in the scalar case. I believe it
was also mishandling packed/unpacked subtargets.

Diff Detail

Event Timeline

arsenm created this revision.Jan 17 2020, 3:53 PM

Herald added a project: Restricted Project. · View Herald TranscriptJan 17 2020, 3:53 PM

Herald added subscribers: kerbowa, hiraditya, t-tye and 4 others. · View Herald Transcript

One question, but apart from that LGTM.

The register initialization code is suboptimal, but I'm going to write up a patch for that.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
5311	When does this case actually happen?

This revision is now accepted and ready to land.Jan 22 2020, 4:20 AM

In D72964#1833404, @nhaehnle wrote:

The register initialization code is suboptimal, but I'm going to write up a patch for that.

Never mind, I believe at least some API use cases actually need it like that... so it would require the frontend to give us more information, and it's not high impact anyway.

arsenm marked an inline comment as done.Jan 22 2020, 5:20 AM

arsenm added inline comments.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
5311	This is the normal case with a load. The chainless case is the weird one above for the non-loading intrinsics

9c928649a085646c4c779bac095643b50b464d83

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIISelLowering.cpp

120 lines

test/

CodeGen/

AMDGPU/

image-load-d16-tfe.ll

410 lines

Diff 238916

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,224 Lines • ▼ Show 20 Lines	static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
if (DLC) {		if (DLC) {
*DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);		*DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
Value &= ~(uint64_t)0x4;		Value &= ~(uint64_t)0x4;
}		}

return Value == 0;		return Value == 0;
}		}

		/// Build a vector of type \p CastVT from \p Src followed by undef padding.
		///
		/// \param DAG       DAG used to create the new nodes.
		/// \param DL        Debug location attached to the build vector.
		/// \param CastVT    Type of the returned build vector.
		/// \param Src       Scalar or vector source. A vector contributes each of
		///                  its elements in order; a scalar contributes itself.
		/// \param ExtraElts Number of undef elements (of Src's scalar type) to
		///                  append after the source elements. Values <= 0 append
		///                  nothing.
		/// \returns A BUILD_VECTOR of type \p CastVT over the collected elements.
		static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
		                              SDValue Src, int ExtraElts) {
		  EVT SrcVT = Src.getValueType();

		  SmallVector<SDValue, 8> Elts;

		  if (SrcVT.isVector())
		    DAG.ExtractVectorElements(Src, Elts);
		  else
		    Elts.push_back(Src);

		  // Use a bounded loop rather than `while (ExtraElts--)`: a negative count
		  // (e.g. more dmask lanes than requested result elements) must not spin
		  // until the counter wraps around.
		  SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
		  for (int I = 0; I < ExtraElts; ++I)
		    Elts.push_back(Undef);

		  return DAG.getBuildVector(CastVT, DL, Elts);
		}

// Re-construct the required return value for a image load intrinsic.		// Re-construct the required return value for a image load intrinsic.
// This is more complicated due to the optional use TexFailCtrl which means the required		// This is more complicated due to the optional use TexFailCtrl which means the required
// return type is an aggregate		// return type is an aggregate
static SDValue constructRetValue(SelectionDAG &DAG,		static SDValue constructRetValue(SelectionDAG &DAG,
MachineSDNode *Result,		MachineSDNode *Result,
ArrayRef<EVT> ResultTypes,		ArrayRef<EVT> ResultTypes,
bool IsTexFail, bool Unpacked, bool IsD16,		bool IsTexFail, bool Unpacked, bool IsD16,
int DMaskPop, int NumVDataDwords,		int DMaskPop, int NumVDataDwords,
const SDLoc &DL, LLVMContext &Context) {		const SDLoc &DL, LLVMContext &Context) {
// Determine the required return type. This is the same regardless of IsTexFail flag		// Determine the required return type. This is the same regardless of IsTexFail flag
EVT ReqRetVT = ResultTypes[0];		EVT ReqRetVT = ResultTypes[0];
EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;		int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;		int NumDataDwords = (!IsD16 \|\| (IsD16 && Unpacked)) ?
EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)		ReqRetNumElts : (ReqRetNumElts + 1) / 2;
: AdjEltVT
: ReqRetVT;

// Extract data part of the result
// Bitcast the result to the same type as the required return type
int NumElts;
if (IsD16 && !Unpacked)
NumElts = NumVDataDwords << 1;
else
NumElts = NumVDataDwords;

EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)		int MaskPopDwords = (!IsD16 \|\| (IsD16 && Unpacked)) ?
: AdjEltVT;		DMaskPop : (DMaskPop + 1) / 2;

// Special case for v6f16. Rather than add support for this, use v3i32 to		MVT DataDwordVT = NumDataDwords == 1 ?
// extract the data elements		MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
bool V6F16Special = false;
if (NumElts == 6) {
CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
DMaskPop >>= 1;
ReqRetNumElts >>= 1;
V6F16Special = true;
AdjVT = MVT::v2i32;
}

SDValue N = SDValue(Result, 0);		MVT MaskPopVT = MaskPopDwords == 1 ?
SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);		MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);

// Iterate over the result		SDValue Data(Result, 0);
SmallVector<SDValue, 4> BVElts;		SDValue TexFail;

if (CastVT.isVector()) {		if (IsTexFail) {
DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);		SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
		if (MaskPopVT.isVector()) {
		Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
		SDValue(Result, 0), ZeroIdx);
} else {		} else {
BVElts.push_back(CastRes);		Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
		SDValue(Result, 0), ZeroIdx);
}		}
int ExtraElts = ReqRetNumElts - DMaskPop;
while(ExtraElts--)
BVElts.push_back(DAG.getUNDEF(AdjEltVT));

SDValue PreTFCRes;		TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
if (ReqRetNumElts > 1) {		SDValue(Result, 0),
SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);		DAG.getConstant(MaskPopDwords, DL, MVT::i32));
if (IsD16 && Unpacked)
PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
else
PreTFCRes = NewVec;
} else {
PreTFCRes = BVElts[0];
}		}

if (V6F16Special)		if (DataDwordVT.isVector())
PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);		Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
		NumDataDwords - MaskPopDwords);

if (!IsTexFail) {		if (IsD16)
if (Result->getNumValues() > 1)		Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
else		if (!ReqRetVT.isVector())
return PreTFCRes;		Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
}
		Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);

		if (TexFail)
		return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);

		if (Result->getNumValues() == 1)
		return Data;

// Extract the TexFail result and insert into aggregate return		return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
		nhaehnleUnsubmitted Not Done Reply Inline Actions When does this case actually happen? nhaehnle: When does this case actually happen?
		arsenmAuthorUnsubmitted Done Reply Inline Actions This is the normal case with a load. The chainless case is the weird one above for the non-loading intrinsics arsenm: This is the normal case with a load. The chainless case is the weird one above for the non…
SmallVector<SDValue, 1> TFCElt;
DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
}		}

static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,		static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
SDValue *LWE, bool &IsTexFail) {		SDValue *LWE, bool &IsTexFail) {
auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());		auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());

uint64_t Value = TexFailCtrlConst->getZExtValue();		uint64_t Value = TexFailCtrlConst->getZExtValue();
if (Value) {		if (Value) {
▲ Show 20 Lines • Show All 233 Lines • ▼ Show 20 Lines	if (DMaskLanes == 0 && !BaseOpcode->Store) {
// This is a no-op load. This can be eliminated		// This is a no-op load. This can be eliminated
SDValue Undef = DAG.getUNDEF(Op.getValueType());		SDValue Undef = DAG.getUNDEF(Op.getValueType());
if (isa<MemSDNode>(Op))		if (isa<MemSDNode>(Op))
return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);		return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
return Undef;		return Undef;
}		}

EVT NewVT = NumVDataDwords > 1 ?		EVT NewVT = NumVDataDwords > 1 ?
EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)		EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
: MVT::f32;		: MVT::i32;

ResultTypes[0] = NewVT;		ResultTypes[0] = NewVT;
if (ResultTypes.size() == 3) {		if (ResultTypes.size() == 3) {
// Original result was aggregate type used for TexFailCtrl results		// Original result was aggregate type used for TexFailCtrl results
// The actual instruction returns as a vector type which has now been		// The actual instruction returns as a vector type which has now been
// created. Remove the aggregate result.		// created. Remove the aggregate result.
ResultTypes.erase(&ResultTypes[1]);		ResultTypes.erase(&ResultTypes[1]);
}		}
▲ Show 20 Lines • Show All 5,479 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GFX9 %s
				; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s \| FileCheck -check-prefix=GFX10 %s
				; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=GFX8-UNPACKED %s

				define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
				; GFX9-LABEL: load_1d_f16_tfe_dmask0:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_mov_b32 s11, s9
				; GFX9-NEXT: s_mov_b32 s10, s8
				; GFX9-NEXT: s_mov_b32 s9, s7
				; GFX9-NEXT: s_mov_b32 s8, s6
				; GFX9-NEXT: s_mov_b32 s7, s5
				; GFX9-NEXT: s_mov_b32 s6, s4
				; GFX9-NEXT: s_mov_b32 s5, s3
				; GFX9-NEXT: s_mov_b32 s4, s2
				; GFX9-NEXT: v_mov_b32_e32 v2, v1
				; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: global_store_short v[0:1], v1, off
				; GFX9-NEXT: global_store_dword v[0:1], v2, off
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: load_1d_f16_tfe_dmask0:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_mov_b32 s11, s9
				; GFX10-NEXT: s_mov_b32 s10, s8
				; GFX10-NEXT: s_mov_b32 s9, s7
				; GFX10-NEXT: s_mov_b32 s8, s6
				; GFX10-NEXT: s_mov_b32 s7, s5
				; GFX10-NEXT: s_mov_b32 s6, s4
				; GFX10-NEXT: s_mov_b32 s5, s3
				; GFX10-NEXT: s_mov_b32 s4, s2
				; GFX10-NEXT: v_mov_b32_e32 v2, v1
				; GFX10-NEXT: ; implicit-def: $vcc_hi
				; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
				; GFX10-NEXT: s_waitcnt vmcnt(0)
				; GFX10-NEXT: global_store_short v[0:1], v1, off
				; GFX10-NEXT: global_store_dword v[0:1], v2, off
				; GFX10-NEXT: s_endpgm
				;
				; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0:
				; GFX8-UNPACKED: ; %bb.0:
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
				; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
				; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
				; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
				; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
				; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
				; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
				; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
				; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
				; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
				; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
				; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
				; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
				; GFX8-UNPACKED-NEXT: s_endpgm
				%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
				%v.data = extractvalue { half, i32 } %v, 0
				%v.err = extractvalue { half, i32 } %v, 1
				store volatile half %v.data, half addrspace(1)* undef
				store volatile i32 %v.err, i32 addrspace(1)* undef
				ret void
				}

				define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
				; GFX9-LABEL: load_1d_f16_tfe_dmask1:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_mov_b32 s11, s9
				; GFX9-NEXT: s_mov_b32 s10, s8
				; GFX9-NEXT: s_mov_b32 s9, s7
				; GFX9-NEXT: s_mov_b32 s8, s6
				; GFX9-NEXT: s_mov_b32 s7, s5
				; GFX9-NEXT: s_mov_b32 s6, s4
				; GFX9-NEXT: s_mov_b32 s5, s3
				; GFX9-NEXT: s_mov_b32 s4, s2
				; GFX9-NEXT: v_mov_b32_e32 v2, v1
				; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: global_store_short v[0:1], v1, off
				; GFX9-NEXT: global_store_dword v[0:1], v2, off
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: load_1d_f16_tfe_dmask1:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_mov_b32 s11, s9
				; GFX10-NEXT: s_mov_b32 s10, s8
				; GFX10-NEXT: s_mov_b32 s9, s7
				; GFX10-NEXT: s_mov_b32 s8, s6
				; GFX10-NEXT: s_mov_b32 s7, s5
				; GFX10-NEXT: s_mov_b32 s6, s4
				; GFX10-NEXT: s_mov_b32 s5, s3
				; GFX10-NEXT: s_mov_b32 s4, s2
				; GFX10-NEXT: v_mov_b32_e32 v2, v1
				; GFX10-NEXT: ; implicit-def: $vcc_hi
				; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
				; GFX10-NEXT: s_waitcnt vmcnt(0)
				; GFX10-NEXT: global_store_short v[0:1], v1, off
				; GFX10-NEXT: global_store_dword v[0:1], v2, off
				; GFX10-NEXT: s_endpgm
				;
				; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1:
				; GFX8-UNPACKED: ; %bb.0:
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
				; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
				; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
				; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
				; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
				; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
				; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
				; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
				; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
				; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
				; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
				; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
				; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
				; GFX8-UNPACKED-NEXT: s_endpgm
				%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
				%v.data = extractvalue { half, i32 } %v, 0
				%v.err = extractvalue { half, i32 } %v, 1
				store volatile half %v.data, half addrspace(1)* undef
				store volatile i32 %v.err, i32 addrspace(1)* undef
				ret void
				}

				define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
				; GFX9-LABEL: load_1d_v2f16_tfe_dmask0:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_mov_b32 s11, s9
				; GFX9-NEXT: s_mov_b32 s10, s8
				; GFX9-NEXT: s_mov_b32 s9, s7
				; GFX9-NEXT: s_mov_b32 s8, s6
				; GFX9-NEXT: s_mov_b32 s7, s5
				; GFX9-NEXT: s_mov_b32 s6, s4
				; GFX9-NEXT: s_mov_b32 s5, s3
				; GFX9-NEXT: s_mov_b32 s4, s2
				; GFX9-NEXT: v_mov_b32_e32 v2, v1
				; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: global_store_dword v[0:1], v1, off
				; GFX9-NEXT: global_store_dword v[0:1], v2, off
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: load_1d_v2f16_tfe_dmask0:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_mov_b32 s11, s9
				; GFX10-NEXT: s_mov_b32 s10, s8
				; GFX10-NEXT: s_mov_b32 s9, s7
				; GFX10-NEXT: s_mov_b32 s8, s6
				; GFX10-NEXT: s_mov_b32 s7, s5
				; GFX10-NEXT: s_mov_b32 s6, s4
				; GFX10-NEXT: s_mov_b32 s5, s3
				; GFX10-NEXT: s_mov_b32 s4, s2
				; GFX10-NEXT: v_mov_b32_e32 v2, v1
				; GFX10-NEXT: ; implicit-def: $vcc_hi
				; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
				; GFX10-NEXT: s_waitcnt vmcnt(0)
				; GFX10-NEXT: global_store_dword v[0:1], v1, off
				; GFX10-NEXT: global_store_dword v[0:1], v2, off
				; GFX10-NEXT: s_endpgm
				;
				; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0:
				; GFX8-UNPACKED: ; %bb.0:
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
				; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
				; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
				; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
				; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
				; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
				; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
				; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
				; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
				; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
				; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
				; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
				; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
				; GFX8-UNPACKED-NEXT: s_endpgm
				%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
				%v.data = extractvalue { <2 x half>, i32 } %v, 0
				%v.err = extractvalue { <2 x half>, i32 } %v, 1
				store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
				store volatile i32 %v.err, i32 addrspace(1)* undef
				ret void
				}

				define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
				; GFX9-LABEL: load_1d_v2f16_tfe_dmask1:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_mov_b32 s11, s9
				; GFX9-NEXT: s_mov_b32 s10, s8
				; GFX9-NEXT: s_mov_b32 s9, s7
				; GFX9-NEXT: s_mov_b32 s8, s6
				; GFX9-NEXT: s_mov_b32 s7, s5
				; GFX9-NEXT: s_mov_b32 s6, s4
				; GFX9-NEXT: s_mov_b32 s5, s3
				; GFX9-NEXT: s_mov_b32 s4, s2
				; GFX9-NEXT: v_mov_b32_e32 v2, v1
				; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: global_store_dword v[0:1], v1, off
				; GFX9-NEXT: global_store_dword v[0:1], v2, off
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: load_1d_v2f16_tfe_dmask1:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_mov_b32 s11, s9
				; GFX10-NEXT: s_mov_b32 s10, s8
				; GFX10-NEXT: s_mov_b32 s9, s7
				; GFX10-NEXT: s_mov_b32 s8, s6
				; GFX10-NEXT: s_mov_b32 s7, s5
				; GFX10-NEXT: s_mov_b32 s6, s4
				; GFX10-NEXT: s_mov_b32 s5, s3
				; GFX10-NEXT: s_mov_b32 s4, s2
				; GFX10-NEXT: v_mov_b32_e32 v2, v1
				; GFX10-NEXT: ; implicit-def: $vcc_hi
				; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
				; GFX10-NEXT: s_waitcnt vmcnt(0)
				; GFX10-NEXT: global_store_dword v[0:1], v1, off
				; GFX10-NEXT: global_store_dword v[0:1], v2, off
				; GFX10-NEXT: s_endpgm
				;
				; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1:
				; GFX8-UNPACKED: ; %bb.0:
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
				; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
				; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
				; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
				; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
				; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
				; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
				; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
				; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
				; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
				; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
				; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
				; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
				; GFX8-UNPACKED-NEXT: s_endpgm
				%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
				%v.data = extractvalue { <2 x half>, i32 } %v, 0
				%v.err = extractvalue { <2 x half>, i32 } %v, 1
				store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
				store volatile i32 %v.err, i32 addrspace(1)* undef
				ret void
				}

				define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
				; GFX9-LABEL: load_1d_v2f16_tfe_dmask3:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_mov_b32 s11, s9
				; GFX9-NEXT: s_mov_b32 s10, s8
				; GFX9-NEXT: s_mov_b32 s9, s7
				; GFX9-NEXT: s_mov_b32 s8, s6
				; GFX9-NEXT: s_mov_b32 s7, s5
				; GFX9-NEXT: s_mov_b32 s6, s4
				; GFX9-NEXT: s_mov_b32 s5, s3
				; GFX9-NEXT: s_mov_b32 s4, s2
				; GFX9-NEXT: v_mov_b32_e32 v2, v1
				; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: global_store_dword v[0:1], v1, off
				; GFX9-NEXT: global_store_dword v[0:1], v2, off
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: load_1d_v2f16_tfe_dmask3:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_mov_b32 s11, s9
				; GFX10-NEXT: s_mov_b32 s10, s8
				; GFX10-NEXT: s_mov_b32 s9, s7
				; GFX10-NEXT: s_mov_b32 s8, s6
				; GFX10-NEXT: s_mov_b32 s7, s5
				; GFX10-NEXT: s_mov_b32 s6, s4
				; GFX10-NEXT: s_mov_b32 s5, s3
				; GFX10-NEXT: s_mov_b32 s4, s2
				; GFX10-NEXT: v_mov_b32_e32 v2, v1
				; GFX10-NEXT: ; implicit-def: $vcc_hi
				; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16
				; GFX10-NEXT: s_waitcnt vmcnt(0)
				; GFX10-NEXT: global_store_dword v[0:1], v1, off
				; GFX10-NEXT: global_store_dword v[0:1], v2, off
				; GFX10-NEXT: s_endpgm
				;
				; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3:
				; GFX8-UNPACKED: ; %bb.0:
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
				; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
				; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
				; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
				; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
				; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
				; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
				; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
				; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1
				; GFX8-UNPACKED-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x3 unorm tfe d16
				; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
				; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
				; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
				; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
				; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3
				; GFX8-UNPACKED-NEXT: s_endpgm
				%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
				%v.data = extractvalue { <2 x half>, i32 } %v, 0
				%v.err = extractvalue { <2 x half>, i32 } %v, 1
				store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
				store volatile i32 %v.err, i32 addrspace(1)* undef
				ret void
				}

				; NOTE(review): the <3 x half> + TFE case is left disabled and has no check
				; lines; presumably it is not handled correctly yet -- confirm and re-enable.
				; The matching intrinsic declaration is still present at the end of this file.
				; define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
				; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
				; %v.data = extractvalue { <3 x half>, i32 } %v, 0
				; %v.err = extractvalue { <3 x half>, i32 } %v, 1
				; store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef
				; store volatile i32 %v.err, i32 addrspace(1)* undef
				; ret void
				; }

				define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) {
				; GFX9-LABEL: load_1d_v4f16_tfe_dmask15:
				; GFX9: ; %bb.0:
				; GFX9-NEXT: v_mov_b32_e32 v1, 0
				; GFX9-NEXT: s_mov_b32 s11, s9
				; GFX9-NEXT: s_mov_b32 s10, s8
				; GFX9-NEXT: s_mov_b32 s9, s7
				; GFX9-NEXT: s_mov_b32 s8, s6
				; GFX9-NEXT: s_mov_b32 s7, s5
				; GFX9-NEXT: s_mov_b32 s6, s4
				; GFX9-NEXT: s_mov_b32 s5, s3
				; GFX9-NEXT: s_mov_b32 s4, s2
				; GFX9-NEXT: v_mov_b32_e32 v2, v1
				; GFX9-NEXT: v_mov_b32_e32 v3, v1
				; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16
				; GFX9-NEXT: s_waitcnt vmcnt(0)
				; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
				; GFX9-NEXT: global_store_dword v[0:1], v3, off
				; GFX9-NEXT: s_endpgm
				;
				; GFX10-LABEL: load_1d_v4f16_tfe_dmask15:
				; GFX10: ; %bb.0:
				; GFX10-NEXT: v_mov_b32_e32 v1, 0
				; GFX10-NEXT: s_mov_b32 s11, s9
				; GFX10-NEXT: s_mov_b32 s10, s8
				; GFX10-NEXT: s_mov_b32 s9, s7
				; GFX10-NEXT: s_mov_b32 s8, s6
				; GFX10-NEXT: s_mov_b32 s7, s5
				; GFX10-NEXT: s_mov_b32 s6, s4
				; GFX10-NEXT: s_mov_b32 s5, s3
				; GFX10-NEXT: s_mov_b32 s4, s2
				; GFX10-NEXT: v_mov_b32_e32 v2, v1
				; GFX10-NEXT: v_mov_b32_e32 v3, v1
				; GFX10-NEXT: ; implicit-def: $vcc_hi
				; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16
				; GFX10-NEXT: s_waitcnt vmcnt(0)
				; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
				; GFX10-NEXT: global_store_dword v[0:1], v3, off
				; GFX10-NEXT: s_endpgm
				;
				; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15:
				; GFX8-UNPACKED: ; %bb.0:
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
				; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
				; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
				; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
				; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
				; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
				; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
				; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
				; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1
				; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v5, v1
				; GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16
				; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
				; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4
				; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2
				; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
				; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
				; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2]
				; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5
				; GFX8-UNPACKED-NEXT: s_endpgm
				%v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
				%v.data = extractvalue { <4 x half>, i32 } %v, 0
				%v.err = extractvalue { <4 x half>, i32 } %v, 1
				store volatile <4 x half> %v.data, <4 x half> addrspace(1)* undef
				store volatile i32 %v.err, i32 addrspace(1)* undef
				ret void
				}

				declare { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
				declare { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
				declare { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
				declare { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0

				attributes #0 = { nounwind readonly }