diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -682,10 +682,16 @@ class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> { int NumDataArgs = !size(P_.DataArgs); int NumDmaskArgs = !if(P_.IsAtomic, 0, 1); + int NumExtraAddrArgs = !size(P_.ExtraAddrArgs); int NumVAddrArgs = !size(P_.AddrArgs); + int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0); + int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs)); int NumRSrcArgs = 1; int NumSampArgs = !if(P_.IsSample, 2, 0); int DmaskArgIndex = NumDataArgs; + int VAddrArgIndex = !add(NumDataArgs, NumDmaskArgs); + int GradientArgIndex = !add(NumDataArgs, NumDmaskArgs, NumExtraAddrArgs); + int CoordArgIndex = !add(NumDataArgs, NumDmaskArgs, NumExtraAddrArgs, NumGradientArgs); int UnormArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, 1); int TexFailCtrlArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, NumSampArgs); int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -53,12 +53,119 @@ return maxnum(Src0, Src1); } +// Check if a value can be converted to a 16-bit value without losing +// precision. +static bool canSafelyConvertTo16Bit(Value &V) { + Type *VTy = V.getType(); + if (VTy->isHalfTy() || VTy->isIntegerTy(16)) { + // The value is already 16-bit, so we don't want to convert to 16-bit again! + return false; + } + if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) { + // Check that we do not lose precision if we cast the constant down to half. + APFloat FloatValue(ConstFloat->getValueAPF()); + bool LosesInfo = true; + FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo); + return !LosesInfo; + } + Value *CastSrc; + if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) || + match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) || + match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) { + Type *CastSrcTy = CastSrc->getType(); + if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16)) + return true; + } + + return false; +} + +// Convert a value to 16-bit. +static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) { + Type *VTy = V.getType(); + if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V)) + return cast<Instruction>(&V)->getOperand(0); + if (VTy->isIntegerTy()) + return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false); + if (VTy->isFloatingPointTy()) + return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext())); + + llvm_unreachable("Should never be called!"); +} + +static Optional<Instruction *> +simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, + const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, + IntrinsicInst &II, InstCombiner &IC) { + if (!ST->hasA16() && !ST->hasG16()) + return None; + + bool FloatCoord = false; + // True means the derivatives can be converted to 16 bit, but the coordinates cannot. + bool OnlyDerivatives = false; + + for (unsigned OperandIndex = ImageDimIntr->GradientStart; + OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) { + Value *Coord = II.getOperand(OperandIndex); + // If the values are not derived from 16-bit values, we cannot optimize.
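+ // For illustration, using a signature from the tests below: llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(dmask, dsdh, dtdh, dsdv, dtdv, s, t, rsrc, samp, unorm, texfailctrl, cachepolicy) has GradientStart = 1, CoordStart = 5 and VAddrEnd = 7, so this loop walks the four derivatives followed by the two coordinates.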
+ if (!canSafelyConvertTo16Bit(*Coord)) { + if (OperandIndex < ImageDimIntr->CoordStart || + ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) { + return None; + } + // All gradients can be converted, so convert only them. + OnlyDerivatives = true; + break; + } + + assert(OperandIndex == ImageDimIntr->GradientStart || + FloatCoord == Coord->getType()->isFloatingPointTy()); + FloatCoord = Coord->getType()->isFloatingPointTy(); + } + + if (OnlyDerivatives) { + if (!ST->hasG16()) + return None; + } else { + if (!ST->hasA16()) + OnlyDerivatives = true; // Only supports G16 + } + + Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext()) + : Type::getInt16Ty(II.getContext()); + + SmallVector<Type *, 4> ArgTys; + if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys)) + return None; + + ArgTys[ImageDimIntr->GradientTyArg] = CoordType; + if (!OnlyDerivatives) + ArgTys[ImageDimIntr->CoordTyArg] = CoordType; + Function *I = + Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys); + + SmallVector<Value *, 8> Args(II.arg_operands()); + + unsigned EndIndex = + OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd; + for (unsigned OperandIndex = ImageDimIntr->GradientStart; + OperandIndex < EndIndex; OperandIndex++) { + Args[OperandIndex] = + convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder); + } + + CallInst *NewCall = IC.Builder.CreateCall(I, Args); + NewCall->takeName(&II); + NewCall->copyMetadata(II); + NewCall->copyFastMathFlags(&II); + return IC.replaceInstUsesWith(II, NewCall); +} + Optional<Instruction *> GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Intrinsic::ID IID = II.getIntrinsicID(); switch (IID) { - default: - break; case Intrinsic::amdgcn_rcp: { Value *Src = II.getArgOperand(0); @@ -715,6 +822,12 @@ break; } + default: { + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { + return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC); + } + } } return None; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -52,6 +52,11 @@ unsigned Intr; unsigned BaseOpcode; MIMGDim Dim; + unsigned GradientStart; + unsigned CoordStart; + unsigned VAddrEnd; + unsigned GradientTyArg; + unsigned CoordTyArg; }; const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr); diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -840,11 +840,19 @@ Intrinsic Intr = I; MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod)); AMDGPUDimProps Dim = I.P.Dim; + AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>; + + bits<8> GradientStart = DimEval.GradientArgIndex; + bits<8> CoordStart = DimEval.CoordArgIndex; + bits<8> VAddrEnd = !add(DimEval.VAddrArgIndex, DimEval.NumVAddrArgs); + bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes, + !foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny))); + bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); } def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim"]; + let Fields = ["Intr", "BaseOpcode", "Dim", "GradientStart", "CoordStart", "VAddrEnd", "GradientTyArg", "CoordTyArg"]; GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
GenericEnum TypeOf_Dim = MIMGDim; diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics-gfx8.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics-gfx8.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics-gfx8.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -instcombine -S < %s | FileCheck %s + +; -------------------------------------------------------------------- +; llvm.amdgcn.image.sample a16 is disabled on pre-gfx9 +; -------------------------------------------------------------------- + +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +define amdgpu_kernel void @image_sample_a16_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { +; CHECK-LABEL: @image_sample_a16_1d( +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_2d( +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_3d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) { +; CHECK-LABEL: @image_sample_a16_3d( +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float +; CHECK-NEXT: [[R32:%.*]] = fpext half [[R:%.*]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float 
[[S32]], float [[T32]], float [[R32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %r32 = fpext half %r to float + %res = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s32, float %t32, float %r32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_cube(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { +; +; CHECK-LABEL: @image_sample_a16_cube( +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float +; CHECK-NEXT: [[FACE32:%.*]] = fpext half [[FACE:%.*]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[FACE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %face32 = fpext half %face to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s32, float %t32, float %face32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_1darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) { +; CHECK-LABEL: @image_sample_a16_1darray( +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float +; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float [[S32]], float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %slice32 = fpext half %slice to float + %res = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_2darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { +; CHECK-LABEL: @image_sample_a16_2darray( +; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float +; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float +; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float [[S32]], float [[T32]], float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %slice32 = fpext half %slice to float + %res = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, 
i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -instcombine -S < %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -instcombine -S < %s | FileCheck %s ; -------------------------------------------------------------------- ; llvm.amdgcn.rcp @@ -66,7 +66,7 @@ define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp { ; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) [[STRICTFP:#[0-9]+]] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) [[ATTR11:#.*]] ; CHECK-NEXT: ret float [[VAL]] ; %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone @@ -1662,7 +1662,7 @@ define i64 @icmp_constant_inputs_true() { ; CHECK-LABEL: @icmp_constant_inputs_true( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata !0) [[CONVERGENT:#[0-9]*]] +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata !0) [[ATTR12:#.*]] ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34) @@ -2369,7 +2369,7 @@ define i64 @fcmp_constant_inputs_true() { ; CHECK-LABEL: @fcmp_constant_inputs_true( -; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata !0) [[CONVERGENT]] +; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata !0) [[ATTR12]] ; CHECK-NEXT: ret i64 [[RESULT]] ; %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 4) @@ -2411,8 +2411,8 @@ define i64 @ballot_one_64() { ; CHECK-LABEL: @ballot_one_64( -; CHECK-NEXT: %b = call i64 @llvm.read_register.i64(metadata !0) [[CONVERGENT]] -; CHECK-NEXT: ret i64 %b +; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.read_register.i64(metadata !0) [[ATTR12]] +; CHECK-NEXT: ret i64 [[B]] ; %b = call i64 @llvm.amdgcn.ballot.i64(i1 1) ret i64 %b @@ -2437,8 +2437,8 @@ define i32 @ballot_one_32() { ; CHECK-LABEL: @ballot_one_32( -; CHECK-NEXT: %b = call i32 @llvm.read_register.i32(metadata !1) [[CONVERGENT]] -; CHECK-NEXT: ret i32 %b +; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.read_register.i32(metadata !1) [[ATTR12]] +; CHECK-NEXT: ret i32 [[B]] ; %b = call i32 @llvm.amdgcn.ballot.i32(i1 1) ret i32 %b @@ -2800,5 +2800,1125 @@ ret void } -; CHECK: attributes [[STRICTFP]] = { nounwind readnone strictfp } -; CHECK: attributes [[CONVERGENT]] = { convergent } +; -------------------------------------------------------------------- +; llvm.amdgcn.image.sample a16 +; -------------------------------------------------------------------- + +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> 
@llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, 
float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +define amdgpu_kernel void @image_sample_a16_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { +; CHECK-LABEL: @image_sample_a16_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> 
[[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_3d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) { +; CHECK-LABEL: @image_sample_a16_3d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %r32 = fpext half %r to float + %res = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s32, float %t32, float %r32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_cube(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { +; +; CHECK-LABEL: @image_sample_a16_cube( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %face32 = fpext half %face to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s32, float %t32, float %face32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_1darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) { +; CHECK-LABEL: @image_sample_a16_1darray( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %slice32 = fpext half %slice to float + %res = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_2darray(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { +; CHECK-LABEL: @image_sample_a16_2darray( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 
16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %slice32 = fpext half %slice to float + %res = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) { +; CHECK-LABEL: @image_sample_a16_c_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_c_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float %zcompare, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %clamp) { +; CHECK-LABEL: @image_sample_a16_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) { +; CHECK-LABEL: @image_sample_a16_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> 
@llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %clamp) { +; CHECK-LABEL: @image_sample_a16_c_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) { +; CHECK-LABEL: @image_sample_a16_c_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float %zcompare, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_b_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) { +; CHECK-LABEL: @image_sample_a16_b_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_b_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_b_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float %bias, float %s32, 
float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_b_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) { +; CHECK-LABEL: @image_sample_a16_c_b_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_b_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_c_b_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_b_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) { +; CHECK-LABEL: @image_sample_a16_b_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_b_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { +; CHECK-LABEL: @image_sample_a16_b_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, 
float %bias, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_b_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) { +; CHECK-LABEL: @image_sample_a16_c_b_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_b_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { +; CHECK-LABEL: @image_sample_a16_c_b_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_d_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) { +; CHECK-LABEL: @image_sample_a16_d_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_d_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_d_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 
0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_d_3d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) { +; CHECK-LABEL: @image_sample_a16_d_3d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %drdh32 = fpext half %drdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %drdv32 = fpext half %drdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %r32 = fpext half %r to float + %res = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %drdh32, float %dsdv32, float %dtdv32, float %drdv32, float %s32, float %t32, float %r32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_d_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) { +; CHECK-LABEL: @image_sample_a16_c_d_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_d_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_c_d_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; 
CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_d_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { +; CHECK-LABEL: @image_sample_a16_d_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %s32 = fpext half %s to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_d_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { +; CHECK-LABEL: @image_sample_a16_d_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_d_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { +; CHECK-LABEL: @image_sample_a16_c_d_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to 
float + %dsdv32 = fpext half %dsdv to float + %s32 = fpext half %s to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_d_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { +; CHECK-LABEL: @image_sample_a16_c_d_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_cd_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) { +; CHECK-LABEL: @image_sample_a16_cd_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_cd_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_cd_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float 
%dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_cd_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) { +; CHECK-LABEL: @image_sample_a16_c_cd_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_cd_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_c_cd_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_cd_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { +; CHECK-LABEL: @image_sample_a16_cd_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %s32 = fpext half %s to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_cd_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, 
half %dsdv, half %dtdv, half %s, half %t, half %clamp) { +; CHECK-LABEL: @image_sample_a16_cd_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_cd_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { +; CHECK-LABEL: @image_sample_a16_c_cd_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %s32 = fpext half %s to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_cd_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { +; CHECK-LABEL: @image_sample_a16_c_cd_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f16(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %clamp32 = fpext half %clamp to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_l_1d(<4 x float> addrspace(1)* 
%out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) { +; CHECK-LABEL: @image_sample_a16_l_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half [[S:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %lod32 = fpext half %lod to float + %res = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s32, float %lod32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_l_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { +; CHECK-LABEL: @image_sample_a16_l_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %lod32 = fpext half %lod to float + %res = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s32, float %t32, float %lod32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_l_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) { +; CHECK-LABEL: @image_sample_a16_c_l_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %lod32 = fpext half %lod to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s32, float %lod32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_l_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { +; CHECK-LABEL: @image_sample_a16_c_l_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %lod32 = fpext half %lod to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s32, float %t32, float %lod32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_lz_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half 
%s) { +; CHECK-LABEL: @image_sample_a16_lz_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_lz_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_lz_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_lz_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) { +; CHECK-LABEL: @image_sample_a16_c_lz_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_lz_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_c_lz_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float %zcompare, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V1(float addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { +; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_V1( +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half 
[[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store float [[TMP1]], float addrspace(1)* [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %slice32 = fpext half %slice to float + %res = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store float %res, float addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V2(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { +; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_V2( +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <2 x float> [[TMP1]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %slice32 = fpext half %slice to float + %res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <2 x float> %res, <2 x float> addrspace(1)* %out + ret void +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.image.sample g16 +; -------------------------------------------------------------------- + +define amdgpu_kernel void @image_sample_g16_d_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; CHECK-LABEL: @image_sample_g16_d_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_d_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; CHECK-LABEL: @image_sample_g16_d_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_d_3d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { +; CHECK-LABEL: @image_sample_g16_d_3d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DRDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[DRDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %drdh32 = fpext half %drdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %drdv32 = fpext half %drdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %drdh32, float %dsdv32, float %dtdv32, float %drdv32, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_c_d_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; CHECK-LABEL: @image_sample_g16_c_d_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_c_d_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; CHECK-LABEL: @image_sample_g16_c_d_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 
false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_d_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; CHECK-LABEL: @image_sample_g16_d_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_d_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; CHECK-LABEL: @image_sample_g16_d_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_c_d_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; CHECK-LABEL: @image_sample_g16_c_d_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s, float %clamp, <8 x i32> %rsrc, 
<4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_c_d_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; CHECK-LABEL: @image_sample_g16_c_d_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_cd_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; CHECK-LABEL: @image_sample_g16_cd_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_cd_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; CHECK-LABEL: @image_sample_g16_cd_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_c_cd_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; CHECK-LABEL: @image_sample_g16_c_cd_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> 
@llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_c_cd_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; CHECK-LABEL: @image_sample_g16_c_cd_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_cd_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; CHECK-LABEL: @image_sample_g16_cd_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float %dsdh32, float %dsdv32, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_cd_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; CHECK-LABEL: @image_sample_g16_cd_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + 
%dtdv32 = fpext half %dtdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_c_cd_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; CHECK-LABEL: @image_sample_g16_c_cd_cl_1d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dsdv32 = fpext half %dsdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dsdv32, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_c_cd_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; CHECK-LABEL: @image_sample_g16_c_cd_cl_2d( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %res = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V1(float addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; CHECK-LABEL: @image_sample_g16_c_d_o_2darray_V1( +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32 4, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store float [[TMP1]], float addrspace(1)* [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %res = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32 4, i32 %offset, float 
%zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store float %res, float addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_g16_c_d_o_2darray_V2(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; CHECK-LABEL: @image_sample_g16_c_d_o_2darray_V2( +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <2 x float> [[TMP1]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8 +; CHECK-NEXT: ret void +; + %dsdh32 = fpext half %dsdh to float + %dtdh32 = fpext half %dtdh to float + %dsdv32 = fpext half %dsdv to float + %dtdv32 = fpext half %dtdv to float + %res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <2 x float> %res, <2 x float> addrspace(1)* %out + ret void +} + +; -------------------------------------------------------------------- +; llvm.amdgcn.image.sample a16 preserve fast-math flags +; -------------------------------------------------------------------- + +define amdgpu_kernel void @image_sample_a16_1d_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { +; CHECK-LABEL: @image_sample_a16_1d_nnan( +; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %res = call nnan <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_1d_nnan_ninf_nsz(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { +; CHECK-LABEL: @image_sample_a16_1d_nnan_ninf_nsz( +; CHECK-NEXT: [[TMP1:%.*]] = call nnan ninf nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %res = call nnan ninf nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_1d_fast(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { +; CHECK-LABEL: @image_sample_a16_1d_fast( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], 
i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %res = call fast <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_2d_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { +; CHECK-LABEL: @image_sample_a16_2d_nnan( +; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %res = call nnan <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_3d_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) { +; CHECK-LABEL: @image_sample_a16_3d_nnan( +; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[R:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %r32 = fpext half %r to float + %res = call nnan <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s32, float %t32, float %r32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_cube_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { +; +; CHECK-LABEL: @image_sample_a16_cube_nnan( +; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[FACE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %face32 = fpext half %face to float + %res = call nnan <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s32, float %t32, float %face32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_1darray_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) { +; CHECK-LABEL: @image_sample_a16_1darray_nnan( +; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half [[S:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + 
%slice32 = fpext half %slice to float + %res = call nnan <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @image_sample_a16_2darray_nnan(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { +; CHECK-LABEL: @image_sample_a16_2darray_nnan( +; CHECK-NEXT: [[TMP1:%.*]] = call nnan <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half [[S:%.*]], half [[T:%.*]], half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16 +; CHECK-NEXT: ret void +; + %s32 = fpext half %s to float + %t32 = fpext half %t to float + %slice32 = fpext half %slice to float + %res = call nnan <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +}
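+
+; Note: the input IR above only calls the 32-bit-coordinate variants of the
+; sample intrinsics; the 16-bit variants in the CHECK lines are introduced by
+; the combine itself. A minimal, representative sketch of the declarations
+; those calls rely on, assuming they are not already declared elsewhere in
+; this test file (signatures are taken directly from the call sites above):
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)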