This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add A16/G16 to InstCombine
ClosedPublic

Authored by Flakebi on Aug 13 2020, 3:03 AM.

Download Raw Diff

Details

Reviewers

arsenm
nhaehnle

Commits

rGb8d199477820: [AMDGPU] Add A16/G16 to InstCombine

Summary

When sampling from images with coordinates that only have 16-bit
accuracy, convert the image intrinsic call to use a16 or g16.
This only happens if the target hardware supports it.

An alternative would be to always apply this combination, independent of
the target hardware, and extend 16-bit arguments to 32-bit arguments
during legalization. To me, this sounds like an unnecessary round trip
that could prevent some further InstCombine optimizations.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

Flakebi created this revision.Aug 13 2020, 3:03 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 13 2020, 3:03 AM

Herald added subscribers: llvm-commits, kerbowa, hiraditya and 6 others. · View Herald Transcript

Flakebi requested review of this revision.Aug 13 2020, 3:03 AM

Herald added a subscriber: wdng. · View Herald TranscriptAug 13 2020, 3:03 AM

Harbormaster completed remote builds in B68234: Diff 285299.Aug 13 2020, 3:34 AM

arsenm added inline comments.Aug 13 2020, 6:02 AM

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
56	Typo loosing
63	No else after return
70	No else after return. Also use match()?
85	No else after return
91	Dead code
96	Don't need llvm::
113	No else after return
828–830	The subtarget is already available in GCNTTI

Fix review comments

Harbormaster completed remote builds in B68278: Diff 285388.Aug 13 2020, 9:26 AM

arsenm added inline comments.Aug 13 2020, 1:02 PM

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
3815	Can you add a test making sure this preserves fast math flags? We should be able to mark these as nnan / ninf etc. and have it be preserved

Preserve fast-math flags and add test that ensures a16 combining is not done on gfx8.

Harbormaster completed remote builds in B68379: Diff 285568.Aug 14 2020, 1:19 AM

arsenm accepted this revision.Aug 19 2020, 11:22 AM

This revision is now accepted and ready to land.Aug 19 2020, 11:22 AM

Closed by commit rGb8d199477820: [AMDGPU] Add A16/G16 to InstCombine (authored by sebastian-ne). · Explain WhyAug 20 2020, 1:52 AM

This revision was automatically updated to reflect the committed changes.

sebastian-ne added a commit: rGb8d199477820: [AMDGPU] Add A16/G16 to InstCombine.

foad added a subscriber: foad.Aug 21 2020, 12:14 AM

sebastian-ne mentioned this in D111754: AMDGPU: Fixes for 'LOD bias' operand in ISelDAG path and GlobalISel path when A16-bit is 'ON'.Oct 14 2021, 6:47 AM

Revision Contents

Path

Size

llvm/

include/

llvm/

IR/

IntrinsicsAMDGPU.td

6 lines

lib/

Target/

AMDGPU/

AMDGPUInstCombineIntrinsic.cpp

117 lines

AMDGPUInstrInfo.h

5 lines

MIMGInstructions.td

10 lines

test/

Transforms/

InstCombine/

AMDGPU/

amdgcn-intrinsics-gfx8.ll

108 lines

amdgcn-intrinsics.ll

1140 lines

Diff 286743

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Show First 20 Lines • Show All 676 Lines • ▼ Show 20 Lines	class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> : AMDGPUDimProfile<"GET_RESINFO", dim> {
let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];		let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
let LodClampMip = "mip";		let LodClampMip = "mip";
}		}

// Helper class for figuring out image intrinsic argument indexes.		// Helper class for figuring out image intrinsic argument indexes.
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {		class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
int NumDataArgs = !size(P_.DataArgs);		int NumDataArgs = !size(P_.DataArgs);
int NumDmaskArgs = !if(P_.IsAtomic, 0, 1);		int NumDmaskArgs = !if(P_.IsAtomic, 0, 1);
		int NumExtraAddrArgs = !size(P_.ExtraAddrArgs);
int NumVAddrArgs = !size(P_.AddrArgs);		int NumVAddrArgs = !size(P_.AddrArgs);
		int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
		int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
int NumRSrcArgs = 1;		int NumRSrcArgs = 1;
int NumSampArgs = !if(P_.IsSample, 2, 0);		int NumSampArgs = !if(P_.IsSample, 2, 0);
int DmaskArgIndex = NumDataArgs;		int DmaskArgIndex = NumDataArgs;
		int VAddrArgIndex = !add(NumDataArgs, NumDmaskArgs);
		int GradientArgIndex = !add(NumDataArgs, NumDmaskArgs, NumExtraAddrArgs);
		int CoordArgIndex = !add(NumDataArgs, NumDmaskArgs, NumExtraAddrArgs, NumGradientArgs);
int UnormArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, 1);		int UnormArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, 1);
int TexFailCtrlArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, NumSampArgs);		int TexFailCtrlArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, NumSampArgs);
int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);		int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}		}

// All dimension-aware intrinsics are derived from this class.		// All dimension-aware intrinsics are derived from this class.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,		class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
list<IntrinsicProperty> props,		list<IntrinsicProperty> props,
▲ Show 20 Lines • Show All 1,303 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Show First 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
APFloat::cmpResult Cmp1 = Max3.compare(Src1);		APFloat::cmpResult Cmp1 = Max3.compare(Src1);
assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");		assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
if (Cmp1 == APFloat::cmpEqual)		if (Cmp1 == APFloat::cmpEqual)
return maxnum(Src0, Src2);		return maxnum(Src0, Src2);

return maxnum(Src0, Src1);		return maxnum(Src0, Src1);
}		}

		// Check if a value can be converted to a 16-bit value without losing
		arsenmUnsubmitted Not Done Reply Inline Actions Typo loosing arsenm: Typo loosing
		// precision.
		static bool canSafelyConvertTo16Bit(Value &V) {
		Type *VTy = V.getType();
		if (VTy->isHalfTy() \|\| VTy->isIntegerTy(16)) {
		// The value is already 16-bit, so we don't want to convert to 16-bit again!
		return false;
		}
		arsenmUnsubmitted Not Done Reply Inline Actions No else after return arsenm: No else after return
		if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
		// We need to check that if we cast the index down to a half, we do not lose
		// precision.
		APFloat FloatValue(ConstFloat->getValueAPF());
		bool LosesInfo = true;
		FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
		return !LosesInfo;
		arsenmUnsubmitted Not Done Reply Inline Actions No else after return. Also use match()? arsenm: No else after return. Also use match()?
		}
		Value *CastSrc;
		if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) \|\|
		match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) \|\|
		match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
		Type *CastSrcTy = CastSrc->getType();
		if (CastSrcTy->isHalfTy() \|\| CastSrcTy->isIntegerTy(16))
		return true;
		}

		return false;
		}

		// Convert a value to 16-bit.
		Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
		arsenmUnsubmitted Not Done Reply Inline Actions No else after return arsenm: No else after return
		Type *VTy = V.getType();
		if (isa<FPExtInst>(&V) \|\| isa<SExtInst>(&V) \|\| isa<ZExtInst>(&V))
		return cast<Instruction>(&V)->getOperand(0);
		if (VTy->isIntegerTy())
		return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
		if (VTy->isFloatingPointTy())
		arsenmUnsubmitted Not Done Reply Inline Actions Dead code arsenm: Dead code
		return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

		llvm_unreachable("Should never be called!");
		}

		arsenmUnsubmitted Not Done Reply Inline Actions Don't need llvm:: arsenm: Don't need llvm::
		static Optional<Instruction *>
		simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
		const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
		IntrinsicInst &II, InstCombiner &IC) {
		if (!ST->hasA16() && !ST->hasG16())
		return None;

		bool FloatCoord = false;
		// true means derivatives can be converted to 16 bit, coordinates not
		bool OnlyDerivatives = false;

		for (unsigned OperandIndex = ImageDimIntr->GradientStart;
		OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
		Value *Coord = II.getOperand(OperandIndex);
		// If the values are not derived from 16-bit values, we cannot optimize.
		if (!canSafelyConvertTo16Bit(*Coord)) {
		if (OperandIndex < ImageDimIntr->CoordStart \|\|
		arsenmUnsubmitted Not Done Reply Inline Actions No else after return arsenm: No else after return
		ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
		return None;
		}
		// All gradients can be converted, so convert only them
		OnlyDerivatives = true;
		break;
		}

		assert(OperandIndex == ImageDimIntr->GradientStart \|\|
		FloatCoord == Coord->getType()->isFloatingPointTy());
		FloatCoord = Coord->getType()->isFloatingPointTy();
		}

		if (OnlyDerivatives) {
		if (!ST->hasG16())
		return None;
		} else {
		if (!ST->hasA16())
		OnlyDerivatives = true; // Only supports G16
		}

		Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
		: Type::getInt16Ty(II.getContext());

		SmallVector<Type *, 4> ArgTys;
		if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
		return None;

		ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
		if (!OnlyDerivatives)
		ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
		Function *I =
		Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

		SmallVector<Value *, 8> Args(II.arg_operands());

		unsigned EndIndex =
		OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
		for (unsigned OperandIndex = ImageDimIntr->GradientStart;
		OperandIndex < EndIndex; OperandIndex++) {
		Args[OperandIndex] =
		convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
		}

		CallInst *NewCall = IC.Builder.CreateCall(I, Args);
		NewCall->takeName(&II);
		NewCall->copyMetadata(II);
		NewCall->copyFastMathFlags(&II);
		return IC.replaceInstUsesWith(II, NewCall);
		}

Optional<Instruction *>		Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {		GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();		Intrinsic::ID IID = II.getIntrinsicID();
switch (IID) {		switch (IID) {
default:
break;
case Intrinsic::amdgcn_rcp: {		case Intrinsic::amdgcn_rcp: {
Value *Src = II.getArgOperand(0);		Value *Src = II.getArgOperand(0);

// TODO: Move to ConstantFolding/InstSimplify?		// TODO: Move to ConstantFolding/InstSimplify?
if (isa<UndefValue>(Src)) {		if (isa<UndefValue>(Src)) {
Type *Ty = II.getType();		Type *Ty = II.getType();
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));		auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
return IC.replaceInstUsesWith(II, QNaN);		return IC.replaceInstUsesWith(II, QNaN);
▲ Show 20 Lines • Show All 640 Lines • ▼ Show 20 Lines	case Intrinsic::amdgcn_ldexp: {
// ldexp(x, 0) -> x		// ldexp(x, 0) -> x
// ldexp(x, undef) -> x		// ldexp(x, undef) -> x
if (isa<UndefValue>(Op1) \|\| match(Op1, PatternMatch::m_ZeroInt())) {		if (isa<UndefValue>(Op1) \|\| match(Op1, PatternMatch::m_ZeroInt())) {
return IC.replaceInstUsesWith(II, Op0);		return IC.replaceInstUsesWith(II, Op0);
}		}

break;		break;
}		}
		default: {
		if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
		AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
		return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
		}
		}
		arsenmUnsubmitted Not Done Reply Inline Actions The subtarget is already available in GCNTTI arsenm: The subtarget is already available in GCNTTI
}		}
return None;		return None;
}		}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.		/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///		///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have		/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.		/// struct returns.
▲ Show 20 Lines • Show All 170 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h

Show First 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	struct D16ImageDimIntrinsic {
unsigned D16HelperIntr;		unsigned D16HelperIntr;
};		};
const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);		const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);

struct ImageDimIntrinsicInfo {		struct ImageDimIntrinsicInfo {
unsigned Intr;		unsigned Intr;
unsigned BaseOpcode;		unsigned BaseOpcode;
MIMGDim Dim;		MIMGDim Dim;
		unsigned GradientStart;
		unsigned CoordStart;
		unsigned VAddrEnd;
		unsigned GradientTyArg;
		unsigned CoordTyArg;
};		};
const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);		const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);

const ImageDimIntrinsicInfo *getImageDimInstrinsicByBaseOpcode(unsigned BaseOpcode,		const ImageDimIntrinsicInfo *getImageDimInstrinsicByBaseOpcode(unsigned BaseOpcode,
unsigned Dim);		unsigned Dim);

} // end AMDGPU namespace		} // end AMDGPU namespace
} // End llvm namespace		} // End llvm namespace

#endif		#endif

llvm/lib/Target/AMDGPU/MIMGInstructions.td

	Show First 20 Lines • Show All 834 Lines • ▼ Show 20 Lines
	/******** ========================================= ********/			/******** ========================================= ********/
	/******** Table of dimension-aware image intrinsics ********/			/******** Table of dimension-aware image intrinsics ********/
	/******** ========================================= ********/			/******** ========================================= ********/

	class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {			class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
	Intrinsic Intr = I;			Intrinsic Intr = I;
	MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));			MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod));
	AMDGPUDimProps Dim = I.P.Dim;			AMDGPUDimProps Dim = I.P.Dim;
				AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;

				bits<8> GradientStart = DimEval.GradientArgIndex;
				bits<8> CoordStart = DimEval.CoordArgIndex;
				bits<8> VAddrEnd = !add(DimEval.VAddrArgIndex, DimEval.NumVAddrArgs);
				bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes,
				!foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny)));
				bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
	}			}

	def ImageDimIntrinsicTable : GenericTable {			def ImageDimIntrinsicTable : GenericTable {
	let FilterClass = "ImageDimIntrinsicInfo";			let FilterClass = "ImageDimIntrinsicInfo";
	let Fields = ["Intr", "BaseOpcode", "Dim"];			let Fields = ["Intr", "BaseOpcode", "Dim", "GradientStart", "CoordStart", "VAddrEnd", "GradientTyArg", "CoordTyArg"];
	GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;			GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
	GenericEnum TypeOf_Dim = MIMGDim;			GenericEnum TypeOf_Dim = MIMGDim;

	let PrimaryKey = ["Intr"];			let PrimaryKey = ["Intr"];
	let PrimaryKeyName = "getImageDimIntrinsicInfo";			let PrimaryKeyName = "getImageDimIntrinsicInfo";
	let PrimaryKeyEarlyOut = 1;			let PrimaryKeyEarlyOut = 1;
	}			}

	▲ Show 20 Lines • Show All 41 Lines • Show Last 20 Lines

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics-gfx8.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -instcombine -S < %s | FileCheck %s

				; --------------------------------------------------------------------
				; llvm.amdgcn.image.sample a16 is disabled on pre-gfx9
				; --------------------------------------------------------------------

				declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

				define amdgpu_kernel void @image_sample_a16_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
				; CHECK-LABEL: @image_sample_a16_1d(
				; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
				; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				define amdgpu_kernel void @image_sample_a16_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_2d(
				; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
				; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
				; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				define amdgpu_kernel void @image_sample_a16_3d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) {
				; CHECK-LABEL: @image_sample_a16_3d(
				; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
				; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
				; CHECK-NEXT: [[R32:%.*]] = fpext half [[R:%.*]] to float
				; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[R32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%r32 = fpext half %r to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s32, float %t32, float %r32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				define amdgpu_kernel void @image_sample_a16_cube(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) {
				;
				; CHECK-LABEL: @image_sample_a16_cube(
				; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
				; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
				; CHECK-NEXT: [[FACE32:%.*]] = fpext half [[FACE:%.*]] to float
				; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[FACE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%face32 = fpext half %face to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s32, float %t32, float %face32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				define amdgpu_kernel void @image_sample_a16_1darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) {
				; CHECK-LABEL: @image_sample_a16_1darray(
				; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
				; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float
				; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float [[S32]], float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%slice32 = fpext half %slice to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				define amdgpu_kernel void @image_sample_a16_2darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) {
				; CHECK-LABEL: @image_sample_a16_2darray(
				; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
				; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
				; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float
				; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%slice32 = fpext half %slice to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -mtriple=amdgcn-amd-amdhsa -instcombine -S < %s | FileCheck %s			; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -instcombine -S < %s | FileCheck %s

	; --------------------------------------------------------------------			; --------------------------------------------------------------------
	; llvm.amdgcn.rcp			; llvm.amdgcn.rcp
	; --------------------------------------------------------------------			; --------------------------------------------------------------------

	declare float @llvm.amdgcn.rcp.f32(float) nounwind readnone			declare float @llvm.amdgcn.rcp.f32(float) nounwind readnone
	declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone			declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone

	▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: ret double 0x3F97D05F417D05F4			; CHECK-NEXT: ret double 0x3F97D05F417D05F4
	;			;
	%val = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) nounwind readnone			%val = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) nounwind readnone
	ret double %val			ret double %val
	}			}

	define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp {			define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp {
	; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp(			; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp(
	; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) [[STRICTFP:#[0-9]+]]			; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) [[ATTR11:#.*]]
	; CHECK-NEXT: ret float [[VAL]]			; CHECK-NEXT: ret float [[VAL]]
	;			;
	%val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone			%val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone
	ret float %val			ret float %val
	}			}

	; --------------------------------------------------------------------			; --------------------------------------------------------------------
	; llvm.amdgcn.rsq			; llvm.amdgcn.rsq
	▲ Show 20 Lines • Show All 1,579 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: ret i64 0			; CHECK-NEXT: ret i64 0
	;			;
	%result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 32)			%result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 32)
	ret i64 %result			ret i64 %result
	}			}

	define i64 @icmp_constant_inputs_true() {			define i64 @icmp_constant_inputs_true() {
	; CHECK-LABEL: @icmp_constant_inputs_true(			; CHECK-LABEL: @icmp_constant_inputs_true(
	; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata !0) [[CONVERGENT:#[0-9]*]]			; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata !0) [[ATTR12:#.*]]
	; CHECK-NEXT: ret i64 [[RESULT]]			; CHECK-NEXT: ret i64 [[RESULT]]
	;			;
	%result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34)			%result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34)
	ret i64 %result			ret i64 %result
	}			}

	define i64 @icmp_constant_to_rhs_slt(i32 %x) {			define i64 @icmp_constant_to_rhs_slt(i32 %x) {
	; CHECK-LABEL: @icmp_constant_to_rhs_slt(			; CHECK-LABEL: @icmp_constant_to_rhs_slt(
	▲ Show 20 Lines • Show All 690 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: ret i64 0			; CHECK-NEXT: ret i64 0
	;			;
	%result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 1)			%result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 1)
	ret i64 %result			ret i64 %result
	}			}

	define i64 @fcmp_constant_inputs_true() {			define i64 @fcmp_constant_inputs_true() {
	; CHECK-LABEL: @fcmp_constant_inputs_true(			; CHECK-LABEL: @fcmp_constant_inputs_true(
	; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata !0) [[CONVERGENT]]			; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata !0) [[ATTR12]]
	; CHECK-NEXT: ret i64 [[RESULT]]			; CHECK-NEXT: ret i64 [[RESULT]]
	;			;
	%result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 4)			%result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 4)
	ret i64 %result			ret i64 %result
	}			}

	define i64 @fcmp_constant_to_rhs_olt(float %x) {			define i64 @fcmp_constant_to_rhs_olt(float %x) {
	; CHECK-LABEL: @fcmp_constant_to_rhs_olt(			; CHECK-LABEL: @fcmp_constant_to_rhs_olt(
	Show All 25 Lines
	; CHECK-NEXT: ret i64 0			; CHECK-NEXT: ret i64 0
	;			;
	%b = call i64 @llvm.amdgcn.ballot.i64(i1 0)			%b = call i64 @llvm.amdgcn.ballot.i64(i1 0)
	ret i64 %b			ret i64 %b
	}			}

	define i64 @ballot_one_64() {			define i64 @ballot_one_64() {
	; CHECK-LABEL: @ballot_one_64(			; CHECK-LABEL: @ballot_one_64(
	; CHECK-NEXT: %b = call i64 @llvm.read_register.i64(metadata !0) [[CONVERGENT]]			; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.read_register.i64(metadata !0) [[ATTR12]]
	; CHECK-NEXT: ret i64 %b			; CHECK-NEXT: ret i64 [[B]]
	;			;
	%b = call i64 @llvm.amdgcn.ballot.i64(i1 1)			%b = call i64 @llvm.amdgcn.ballot.i64(i1 1)
	ret i64 %b			ret i64 %b
	}			}

	define i32 @ballot_nocombine_32(i1 %i) {			define i32 @ballot_nocombine_32(i1 %i) {
	; CHECK-LABEL: @ballot_nocombine_32(			; CHECK-LABEL: @ballot_nocombine_32(
	; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I:%.*]])			; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[I:%.*]])
	; CHECK-NEXT: ret i32 [[B]]			; CHECK-NEXT: ret i32 [[B]]
	;			;
	%b = call i32 @llvm.amdgcn.ballot.i32(i1 %i)			%b = call i32 @llvm.amdgcn.ballot.i32(i1 %i)
	ret i32 %b			ret i32 %b
	}			}

	define i32 @ballot_zero_32() {			define i32 @ballot_zero_32() {
	; CHECK-LABEL: @ballot_zero_32(			; CHECK-LABEL: @ballot_zero_32(
	; CHECK-NEXT: ret i32 0			; CHECK-NEXT: ret i32 0
	;			;
	%b = call i32 @llvm.amdgcn.ballot.i32(i1 0)			%b = call i32 @llvm.amdgcn.ballot.i32(i1 0)
	ret i32 %b			ret i32 %b
	}			}

	define i32 @ballot_one_32() {			define i32 @ballot_one_32() {
	; CHECK-LABEL: @ballot_one_32(			; CHECK-LABEL: @ballot_one_32(
	; CHECK-NEXT: %b = call i32 @llvm.read_register.i32(metadata !1) [[CONVERGENT]]			; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.read_register.i32(metadata !1) [[ATTR12]]
	; CHECK-NEXT: ret i32 %b			; CHECK-NEXT: ret i32 [[B]]
	;			;
	%b = call i32 @llvm.amdgcn.ballot.i32(i1 1)			%b = call i32 @llvm.amdgcn.ballot.i32(i1 1)
	ret i32 %b			ret i32 %b
	}			}

	; --------------------------------------------------------------------			; --------------------------------------------------------------------
	; llvm.amdgcn.wqm.vote			; llvm.amdgcn.wqm.vote
	; --------------------------------------------------------------------			; --------------------------------------------------------------------
	▲ Show 20 Lines • Show All 345 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: store i32 [[RES]], i32 addrspace(1)* [[OUT:%.*]], align 4			; CHECK-NEXT: store i32 [[RES]], i32 addrspace(1)* [[OUT:%.*]], align 4
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;
	%res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)			%res = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
	store i32 %res, i32 addrspace(1)* %out			store i32 %res, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; CHECK: attributes [[STRICTFP]] = { nounwind readnone strictfp }			; --------------------------------------------------------------------
	; CHECK: attributes [[CONVERGENT]] = { convergent }			; llvm.amdgcn.image.sample a16
				; --------------------------------------------------------------------

				declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

				; .c, .cl and .c.cl overloads (1d/2d). Each of the .c and .cl name components
				; corresponds to one additional float operand in the signature.
				declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

				; .b overloads (optionally combined with .c and .cl). Note the mangled names
				; carry two trailing type suffixes (.f32.f32) instead of one.
				declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

				; .d overloads (optionally combined with .c and .cl) for 1d/2d/3d. The mangled
				; names carry two trailing type suffixes (.f32.f32).
				declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

				; .cd overloads (optionally combined with .c and .cl) for 1d/2d. Operand lists
				; have the same shape as the corresponding .d declarations.
				declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

				; .l and .c.l overloads (1d/2d), f32 operands.
				declare <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

				; .lz and .c.lz overloads (1d/2d), f32 operands.
				declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

				; .c.d.o.2darray overloads returning a scalar float and a <2 x float>; these
				; take an extra i32 operand directly after the leading i32.
				declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
				declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

				; a16 combine: the fpext'ed half coordinate feeding image.sample.1d is expected
				; to be folded away, leaving a call to the .f16 overload.
				define amdgpu_kernel void @image_sample_a16_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
				; CHECK-LABEL: @image_sample_a16_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine: the fpext'ed half coordinates feeding image.sample.2d are
				; expected to be folded away, leaving a call to the .f16 overload.
				define amdgpu_kernel void @image_sample_a16_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine: the fpext'ed half coordinates feeding image.sample.3d are
				; expected to be folded away, leaving a call to the .f16 overload.
				define amdgpu_kernel void @image_sample_a16_3d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) {
				; CHECK-LABEL: @image_sample_a16_3d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%r32 = fpext half %r to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s32, float %t32, float %r32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine: the fpext'ed half coordinates (including the face operand)
				; feeding image.sample.cube are expected to fold into the .f16 overload.
				define amdgpu_kernel void @image_sample_a16_cube(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) {
				;
				; CHECK-LABEL: @image_sample_a16_cube(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%face32 = fpext half %face to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s32, float %t32, float %face32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine: the fpext'ed half coordinate and slice feeding
				; image.sample.1darray are expected to fold into the .f16 overload.
				define amdgpu_kernel void @image_sample_a16_1darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) {
				; CHECK-LABEL: @image_sample_a16_1darray(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%slice32 = fpext half %slice to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine: the fpext'ed half coordinates and slice feeding
				; image.sample.2darray are expected to fold into the .f16 overload.
				define amdgpu_kernel void @image_sample_a16_2darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) {
				; CHECK-LABEL: @image_sample_a16_2darray(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%slice32 = fpext half %slice to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.c.1d: the fpext'ed half coordinate is expected to fold
				; into the .f16 overload while the float %zcompare operand stays f32.
				define amdgpu_kernel void @image_sample_a16_c_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) {
				; CHECK-LABEL: @image_sample_a16_c_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.c.2d: the fpext'ed half coordinates are expected to
				; fold into the .f16 overload while the float %zcompare operand stays f32.
				define amdgpu_kernel void @image_sample_a16_c_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_c_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float %zcompare, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.cl.1d: the fpext'ed half coordinate and clamp are
				; expected to fold into the .f16 overload.
				define amdgpu_kernel void @image_sample_a16_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.cl.2d: the fpext'ed half coordinates and clamp are
				; expected to fold into the .f16 overload.
				define amdgpu_kernel void @image_sample_a16_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.c.cl.1d: the fpext'ed half coordinate and clamp are
				; expected to fold into the .f16 overload; float %zcompare stays f32.
				define amdgpu_kernel void @image_sample_a16_c_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_c_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.c.cl.2d: the fpext'ed half coordinates and clamp are
				; expected to fold into the .f16 overload; float %zcompare stays f32.
				define amdgpu_kernel void @image_sample_a16_c_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_c_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float %zcompare, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.b.1d: the fpext'ed half coordinate is expected to fold
				; away (overload .f32.f16); the float %bias operand stays f32.
				define amdgpu_kernel void @image_sample_a16_b_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) {
				; CHECK-LABEL: @image_sample_a16_b_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.b.2d: the fpext'ed half coordinates are expected to
				; fold away (overload .f32.f16); the float %bias operand stays f32.
				define amdgpu_kernel void @image_sample_a16_b_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_b_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float %bias, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.c.b.1d: the fpext'ed half coordinate is expected to
				; fold away (overload .f32.f16); float %bias and %zcompare stay f32.
				define amdgpu_kernel void @image_sample_a16_c_b_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) {
				; CHECK-LABEL: @image_sample_a16_c_b_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.c.b.2d: the fpext'ed half coordinates are expected to
				; fold away (overload .f32.f16); float %bias and %zcompare stay f32.
				define amdgpu_kernel void @image_sample_a16_c_b_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_c_b_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.b.cl.1d: the fpext'ed half coordinate and clamp are
				; expected to fold away (overload .f32.f16); float %bias stays f32.
				define amdgpu_kernel void @image_sample_a16_b_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_b_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.b.cl.2d: the fpext'ed half coordinates and clamp are
				; expected to fold away (overload .f32.f16); float %bias stays f32.
				define amdgpu_kernel void @image_sample_a16_b_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_b_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.c.b.cl.1d: the fpext'ed half coordinate and clamp are
				; expected to fold away (overload .f32.f16); %bias and %zcompare stay f32.
				define amdgpu_kernel void @image_sample_a16_c_b_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_c_b_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 combine for sample.c.b.cl.2d: the fpext'ed half coordinates and clamp are
				; expected to fold away (overload .f32.f16); %bias and %zcompare stay f32.
				define amdgpu_kernel void @image_sample_a16_c_b_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_c_b_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16/g16 combine for sample.d.1d: the fpext'ed half derivatives and coordinate
				; are expected to fold away, leaving the .f16.f16 overload.
				define amdgpu_kernel void @image_sample_a16_d_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) {
				; CHECK-LABEL: @image_sample_a16_d_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16/g16 combine for sample.d.2d: the fpext'ed half derivatives and
				; coordinates are expected to fold away, leaving the .f16.f16 overload.
				define amdgpu_kernel void @image_sample_a16_d_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_d_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16/g16 combine for sample.d.3d: the fpext'ed half derivatives and
				; coordinates are expected to fold away, leaving the .f16.f16 overload.
				define amdgpu_kernel void @image_sample_a16_d_3d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) {
				; CHECK-LABEL: @image_sample_a16_d_3d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%drdh32 = fpext half %drdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%drdv32 = fpext half %drdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%r32 = fpext half %r to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %drdh32, float %dsdv32, float %dtdv32, float %drdv32, float %s32, float %t32, float %r32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16/g16 combine for sample.c.d.1d: the fpext'ed half derivatives and
				; coordinate are expected to fold away (.f16.f16); float %zcompare stays f32.
				define amdgpu_kernel void @image_sample_a16_c_d_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) {
				; CHECK-LABEL: @image_sample_a16_c_d_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16/g16 combine for sample.c.d.2d: the fpext'ed half derivatives and
				; coordinates are expected to fold away (.f16.f16); float %zcompare stays f32.
				define amdgpu_kernel void @image_sample_a16_c_d_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_c_d_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16/g16 combine for sample.d.cl.1d: the fpext'ed half derivatives, coordinate
				; and clamp are expected to fold away, leaving the .f16.f16 overload.
				define amdgpu_kernel void @image_sample_a16_d_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_d_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%s32 = fpext half %s to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16/g16 combine for sample.d.cl.2d: the fpext'ed half derivatives,
				; coordinates and clamp are expected to fold away, leaving .f16.f16.
				define amdgpu_kernel void @image_sample_a16_d_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_d_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16/g16 combine for sample.c.d.cl.1d: the fpext'ed half derivatives,
				; coordinate and clamp are expected to fold away (.f16.f16); %zcompare stays f32.
				define amdgpu_kernel void @image_sample_a16_c_d_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_c_d_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%s32 = fpext half %s to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.d.cl.2d with a16: all four derivatives, both coordinates and the clamp
				; are fpext'd from half, so the expected call is the .f16.f16 overload taking the
				; half values directly; %zcompare is a plain float and is passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_d_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_c_d_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.cd.1d with a16: both derivatives and the coordinate are fpext'd from
				; half, so the expected call is the .f16.f16 overload taking the half values.
				define amdgpu_kernel void @image_sample_a16_cd_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) {
				; CHECK-LABEL: @image_sample_a16_cd_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.cd.2d with a16: all four derivatives and both coordinates are fpext'd
				; from half, so the expected call is the .f16.f16 overload taking the half values.
				define amdgpu_kernel void @image_sample_a16_cd_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_cd_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.cd.1d with a16: both derivatives and the coordinate are fpext'd from
				; half, so the expected call is the .f16.f16 overload taking the half values;
				; %zcompare is a plain float and is passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_cd_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) {
				; CHECK-LABEL: @image_sample_a16_c_cd_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.cd.2d with a16: all four derivatives and both coordinates are fpext'd
				; from half, so the expected call is the .f16.f16 overload taking the half values;
				; %zcompare is a plain float and is passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_cd_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_c_cd_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.cd.cl.1d with a16: both derivatives, the coordinate and the clamp are
				; fpext'd from half, so the expected call is the .f16.f16 overload taking the
				; half values directly.
				define amdgpu_kernel void @image_sample_a16_cd_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_cd_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%s32 = fpext half %s to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.cd.cl.2d with a16: all four derivatives, both coordinates and the clamp
				; are fpext'd from half, so the expected call is the .f16.f16 overload taking the
				; half values directly.
				define amdgpu_kernel void @image_sample_a16_cd_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_cd_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.cd.cl.1d with a16: both derivatives, the coordinate and the clamp are
				; fpext'd from half, so the expected call is the .f16.f16 overload taking the
				; half values directly; %zcompare is a plain float and is passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_cd_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_c_cd_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%s32 = fpext half %s to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.cd.cl.2d with a16: all four derivatives, both coordinates and the
				; clamp are fpext'd from half, so the expected call is the .f16.f16 overload
				; taking the half values; %zcompare is a plain float and is passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_cd_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
				; CHECK-LABEL: @image_sample_a16_c_cd_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%clamp32 = fpext half %clamp to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.l.1d with a16: the coordinate and the lod are fpext'd from half, so the
				; expected call is the .f16 overload taking the half values directly.
				define amdgpu_kernel void @image_sample_a16_l_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) {
				; CHECK-LABEL: @image_sample_a16_l_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half [[S:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%lod32 = fpext half %lod to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s32, float %lod32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.l.2d with a16: both coordinates and the lod are fpext'd from half, so
				; the expected call is the .f16 overload taking the half values directly.
				define amdgpu_kernel void @image_sample_a16_l_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) {
				; CHECK-LABEL: @image_sample_a16_l_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%lod32 = fpext half %lod to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s32, float %t32, float %lod32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.l.1d with a16: the coordinate and the lod are fpext'd from half, so
				; the expected call is the .f16 overload taking the half values directly;
				; %zcompare is a plain float and is passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_l_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) {
				; CHECK-LABEL: @image_sample_a16_c_l_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%lod32 = fpext half %lod to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s32, float %lod32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.l.2d with a16: both coordinates and the lod are fpext'd from half, so
				; the expected call is the .f16 overload taking the half values directly;
				; %zcompare is a plain float and is passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_l_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) {
				; CHECK-LABEL: @image_sample_a16_c_l_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%lod32 = fpext half %lod to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s32, float %t32, float %lod32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.lz.1d with a16: the single coordinate is fpext'd from half, so the
				; expected call is the .f16 overload taking the half value directly.
				define amdgpu_kernel void @image_sample_a16_lz_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
				; CHECK-LABEL: @image_sample_a16_lz_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.lz.2d with a16: both coordinates are fpext'd from half, so the expected
				; call is the .f16 overload taking the half values directly.
				define amdgpu_kernel void @image_sample_a16_lz_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_lz_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.lz.1d with a16: the single coordinate is fpext'd from half, so the
				; expected call is the .f16 overload taking the half value directly; %zcompare
				; is a plain float and is passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_lz_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) {
				; CHECK-LABEL: @image_sample_a16_c_lz_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.lz.2d with a16: both coordinates are fpext'd from half, so the
				; expected call is the .f16 overload taking the half values directly; %zcompare
				; is a plain float and is passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_lz_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_c_lz_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float %zcompare, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.d.o.2darray with a16, scalar float result (dmask 4): the derivatives,
				; coordinates and slice are fpext'd from half, so the expected call is the
				; .f32.f16.f16 overload taking the half values; the i32 %offset and float
				; %zcompare operands are passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V1(float addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) {
				; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_V1(
				; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store float [[TMP1]], float addrspace(1)* [[OUT:%.*]], align 4
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%slice32 = fpext half %slice to float
				%res = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store float %res, float addrspace(1)* %out
				ret void
				}

				; sample.c.d.o.2darray with a16, <2 x float> result (dmask 6): the derivatives,
				; coordinates and slice are fpext'd from half, so the expected call is the
				; .v2f32.f16.f16 overload taking the half values; the i32 %offset and float
				; %zcompare operands are passed through as-is.
				define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V2(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) {
				; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_V2(
				; CHECK-NEXT: [[TMP1:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <2 x float> [[TMP1]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%slice32 = fpext half %slice to float
				%res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <2 x float> %res, <2 x float> addrspace(1)* %out
				ret void
				}

				; --------------------------------------------------------------------
				; llvm.amdgcn.image.sample g16
				; --------------------------------------------------------------------

				; sample.d.1d with g16 only: the two derivatives are fpext'd from half while the
				; coordinate %s is a genuine float, so the expected call is the .f16.f32 overload
				; (half gradients, float coordinate).
				define amdgpu_kernel void @image_sample_g16_d_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
				; CHECK-LABEL: @image_sample_g16_d_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.d.2d with g16 only: the four derivatives are fpext'd from half while the
				; coordinates %s/%t are genuine floats, so the expected call is the .f16.f32
				; overload (half gradients, float coordinates).
				define amdgpu_kernel void @image_sample_g16_d_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
				; CHECK-LABEL: @image_sample_g16_d_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.d.3d with g16 only: the six derivatives are fpext'd from half while the
				; coordinates %s/%t/%r are genuine floats, so the expected call is the .f16.f32
				; overload (half gradients, float coordinates).
				define amdgpu_kernel void @image_sample_g16_d_3d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
				; CHECK-LABEL: @image_sample_g16_d_3d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%drdh32 = fpext half %drdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%drdv32 = fpext half %drdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %drdh32, float %dsdv32, float %dtdv32, float %drdv32, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.d.1d with g16 only: the two derivatives are fpext'd from half while
				; %zcompare and the coordinate %s are genuine floats, so the expected call is
				; the .f16.f32 overload (half gradients, float coordinate).
				define amdgpu_kernel void @image_sample_g16_c_d_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
				; CHECK-LABEL: @image_sample_g16_c_d_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.d.2d with g16 only: the four derivatives are fpext'd from half while
				; %zcompare and the coordinates %s/%t are genuine floats, so the expected call
				; is the .f16.f32 overload (half gradients, float coordinates).
				define amdgpu_kernel void @image_sample_g16_c_d_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
				; CHECK-LABEL: @image_sample_g16_c_d_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.d.cl.1d with g16 only: the two derivatives are fpext'd from half while
				; the coordinate %s and %clamp are genuine floats, so the expected call is the
				; .f16.f32 overload (half gradients, float coordinate/clamp).
				define amdgpu_kernel void @image_sample_g16_d_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
				; CHECK-LABEL: @image_sample_g16_d_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.d.cl.2d with g16 only: the four derivatives are fpext'd from half while
				; the coordinates %s/%t and %clamp are genuine floats, so the expected call is
				; the .f16.f32 overload (half gradients, float coordinates/clamp).
				define amdgpu_kernel void @image_sample_g16_d_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
				; CHECK-LABEL: @image_sample_g16_d_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.d.cl.1d with g16 only: the two derivatives are fpext'd from half while
				; %zcompare, the coordinate %s and %clamp are genuine floats, so the expected
				; call is the .f16.f32 overload (half gradients, float coordinate/clamp).
				define amdgpu_kernel void @image_sample_g16_c_d_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
				; CHECK-LABEL: @image_sample_g16_c_d_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.c.d.cl.2d with g16 only: the four derivatives are fpext'd from half
				; while %zcompare, the coordinates %s/%t and %clamp are genuine floats, so the
				; expected call is the .f16.f32 overload (half gradients, float coordinates/clamp).
				define amdgpu_kernel void @image_sample_g16_c_d_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
				; CHECK-LABEL: @image_sample_g16_c_d_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.cd.1d with g16 only: the two derivatives are fpext'd from half while the
				; coordinate %s is a genuine float, so the expected call is the .f16.f32 overload
				; (half gradients, float coordinate).
				define amdgpu_kernel void @image_sample_g16_cd_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
				; CHECK-LABEL: @image_sample_g16_cd_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; sample.cd.2d with g16 only: the four derivatives are fpext'd from half while
				; the coordinates %s/%t are genuine floats, so the expected call is the .f16.f32
				; overload (half gradients, float coordinates).
				define amdgpu_kernel void @image_sample_g16_cd_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
				; CHECK-LABEL: @image_sample_g16_cd_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; g16, sample.c.cd 1d: the derivatives are fpext'ed from half and are expected to become half operands of the .f16.f32 overload; %zcompare and the coordinate %s stay float.
				define amdgpu_kernel void @image_sample_g16_c_cd_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
				; CHECK-LABEL: @image_sample_g16_c_cd_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; g16, sample.c.cd 2d: the four derivatives are fpext'ed from half and are expected to become half operands of the .f16.f32 overload; %zcompare and the coordinates %s/%t stay float.
				define amdgpu_kernel void @image_sample_g16_c_cd_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
				; CHECK-LABEL: @image_sample_g16_c_cd_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; g16, sample.cd.cl 1d: the derivatives are fpext'ed from half and are expected to become half operands of the .f16.f32 overload; the coordinate %s and %clamp stay float.
				define amdgpu_kernel void @image_sample_g16_cd_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
				; CHECK-LABEL: @image_sample_g16_cd_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; g16, sample.cd.cl 2d: the four derivatives are fpext'ed from half and are expected to become half operands of the .f16.f32 overload; the coordinates %s/%t and %clamp stay float.
				define amdgpu_kernel void @image_sample_g16_cd_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
				; CHECK-LABEL: @image_sample_g16_cd_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; g16, sample.c.cd.cl 1d: the derivatives are fpext'ed from half and are expected to become half operands of the .f16.f32 overload; %zcompare, the coordinate %s and %clamp stay float.
				define amdgpu_kernel void @image_sample_g16_c_cd_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
				; CHECK-LABEL: @image_sample_g16_c_cd_cl_1d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dsdv32 = fpext half %dsdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; g16, sample.c.cd.cl 2d: the four derivatives are fpext'ed from half and are expected to become half operands of the .f16.f32 overload; %zcompare, the coordinates %s/%t and %clamp stay float.
				define amdgpu_kernel void @image_sample_g16_c_cd_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
				; CHECK-LABEL: @image_sample_g16_c_cd_cl_2d(
				; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; g16, sample.c.d.o 2darray returning a single float (dmask 4): the four derivatives are fpext'ed from half and are expected to become half operands of the .f16.f32 overload; %offset, %zcompare, %s, %t and %slice are passed through unchanged.
				define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V1(float addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
				; CHECK-LABEL: @image_sample_g16_c_d_o_2darray_V1(
				; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32 4, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store float [[TMP1]], float addrspace(1)* [[OUT:%.*]], align 4
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%res = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store float %res, float addrspace(1)* %out
				ret void
				}

				; g16, sample.c.d.o 2darray returning <2 x float> (dmask 6): the four derivatives are fpext'ed from half and are expected to become half operands of the .f16.f32 overload; %offset, %zcompare, %s, %t and %slice are passed through unchanged.
				define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V2(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
				; CHECK-LABEL: @image_sample_g16_c_d_o_2darray_V2(
				; CHECK-NEXT: [[TMP1:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <2 x float> [[TMP1]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8
				; CHECK-NEXT: ret void
				;
				%dsdh32 = fpext half %dsdh to float
				%dtdh32 = fpext half %dtdh to float
				%dsdv32 = fpext half %dsdv to float
				%dtdv32 = fpext half %dtdv to float
				%res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <2 x float> %res, <2 x float> addrspace(1)* %out
				ret void
				}
				[Inline review comment from arsenm — not done] Can you add a test making sure this preserves fast math flags? We should be able to mark these as nnan / ninf etc. and have it be preserved.

				; --------------------------------------------------------------------
				; llvm.amdgcn.image.sample a16 preserve fast-math flags
				; --------------------------------------------------------------------

				; a16 + fast-math, sample 1d: the coordinate is fpext'ed from half, so the call is expected to use the .f16 overload, and the nnan flag on the call must be kept.
				define amdgpu_kernel void @image_sample_a16_1d_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
				; CHECK-LABEL: @image_sample_a16_1d_nnan(
				; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%res = call nnan <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 + fast-math, sample 1d: the coordinate is fpext'ed from half, so the call is expected to use the .f16 overload, and the combined nnan ninf nsz flags on the call must all be kept.
				define amdgpu_kernel void @image_sample_a16_1d_nnan_ninf_nsz(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
				; CHECK-LABEL: @image_sample_a16_1d_nnan_ninf_nsz(
				; CHECK-NEXT: [[TMP1:%.*]] = call nnan ninf nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%res = call nnan ninf nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 + fast-math, sample 1d: the coordinate is fpext'ed from half, so the call is expected to use the .f16 overload, and the `fast` flag on the call must be kept.
				define amdgpu_kernel void @image_sample_a16_1d_fast(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
				; CHECK-LABEL: @image_sample_a16_1d_fast(
				; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%res = call fast <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 + fast-math, sample 2d: both coordinates are fpext'ed from half, so the call is expected to use the .f16 overload, and the nnan flag on the call must be kept.
				define amdgpu_kernel void @image_sample_a16_2d_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
				; CHECK-LABEL: @image_sample_a16_2d_nnan(
				; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%res = call nnan <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 + fast-math, sample 3d: all three coordinates are fpext'ed from half, so the call is expected to use the .f16 overload, and the nnan flag on the call must be kept.
				define amdgpu_kernel void @image_sample_a16_3d_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) {
				; CHECK-LABEL: @image_sample_a16_3d_nnan(
				; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%r32 = fpext half %r to float
				%res = call nnan <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s32, float %t32, float %r32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 + fast-math, sample cube: %s, %t and %face are fpext'ed from half, so the call is expected to use the .f16 overload, and the nnan flag on the call must be kept.
				define amdgpu_kernel void @image_sample_a16_cube_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) {
				;
				; CHECK-LABEL: @image_sample_a16_cube_nnan(
				; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%face32 = fpext half %face to float
				%res = call nnan <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s32, float %t32, float %face32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 + fast-math, sample 1darray: %s and %slice are fpext'ed from half, so the call is expected to use the .f16 overload, and the nnan flag on the call must be kept.
				define amdgpu_kernel void @image_sample_a16_1darray_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) {
				; CHECK-LABEL: @image_sample_a16_1darray_nnan(
				; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%slice32 = fpext half %slice to float
				%res = call nnan <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}

				; a16 + fast-math, sample 2darray: %s, %t and %slice are fpext'ed from half, so the call is expected to use the .f16 overload, and the nnan flag on the call must be kept.
				define amdgpu_kernel void @image_sample_a16_2darray_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) {
				; CHECK-LABEL: @image_sample_a16_2darray_nnan(
				; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
				; CHECK-NEXT: ret void
				;
				%s32 = fpext half %s to float
				%t32 = fpext half %t to float
				%slice32 = fpext half %slice to float
				%res = call nnan <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
				store <4 x float> %res, <4 x float> addrspace(1)* %out
				ret void
				}