diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -558,6 +558,9 @@
   // {offset} {bias} {z-compare}
   list<AMDGPUArg> ExtraAddrArgs = extra_addr;
+  bit Offset = false;
+  bit Bias = false;
+  bit ZCompare = false;
   bit Gradients = false;
 
   // Name of the {lod} or {clamp} argument that is appended to the coordinates,
@@ -571,6 +574,7 @@ multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
   def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
+  let Offset = true in
   def NAME#lcmod#_o : AMDGPUSampleVariant<
       ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
 }
@@ -578,6 +582,7 @@ multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
   defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
+  let ZCompare = true in
   defm NAME : AMDGPUSampleHelper_Offset<
       "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
 }
@@ -591,6 +596,7 @@ defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
   defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
+  let Bias = true in
   defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
       "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
   let LodOrClamp = "lod" in
@@ -618,6 +624,9 @@
   list<LLVMType> RetTypes = [];
   list<AMDGPUArg> DataArgs = [];
   list<AMDGPUArg> ExtraAddrArgs = [];
+  bit Offset = false;
+  bit Bias = false;
+  bit ZCompare = false;
   bit Gradients = false;
   string LodClampMip = "";
@@ -652,6 +661,9 @@
   let RetTypes = base.RetTypes;
   let DataArgs = base.DataArgs;
   let ExtraAddrArgs = base.ExtraAddrArgs;
+  let Offset = base.Offset;
+  let Bias = base.Bias;
+  let ZCompare = base.ZCompare;
   let Gradients = base.Gradients;
   let LodClampMip = base.LodClampMip;
 }
@@ -662,6 +674,9 @@
   let IsSample = true;
   let RetTypes = [llvm_any_ty];
   let ExtraAddrArgs = sample.ExtraAddrArgs;
+  let Offset = sample.Offset;
+  let Bias = sample.Bias;
+  let ZCompare = sample.ZCompare;
   let Gradients = sample.Gradients;
   let LodClampMip = sample.LodOrClamp;
 }
@@ -702,7 +717,10 @@ class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
   int NumDataArgs = !size(P_.DataArgs);
   int NumDmaskArgs = !not(P_.IsAtomic);
-  int NumExtraAddrArgs = !size(P_.ExtraAddrArgs);
+  int NumOffsetArgs = !if(P_.Offset, 1, 0);
+  int NumBiasArgs = !if(P_.Bias, 1, 0);
+  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
+  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
   int NumVAddrArgs = !size(P_.AddrArgs);
   int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
   int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
@@ -710,6 +728,9 @@
   int NumSampArgs = !if(P_.IsSample, 2, 0);
   int DmaskArgIndex = NumDataArgs;
   int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
+  int OffsetArgIndex = VAddrArgIndex;
+  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
+  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
   int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
   int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
   int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
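For reference (worked out from the evaluator above and the tests below; not part of the patch), the fixed operand layout of llvm.amdgcn.image.sample.c.b.cl.2d is:

//  0: dmask        (DmaskArgIndex)
//  1: bias         (VAddrArgIndex == OffsetArgIndex == BiasArgIndex;
//                   this variant has no offset, so NumOffsetArgs == 0)
//  2: zcompare     (ZCompareArgIndex == BiasArgIndex + NumBiasArgs)
//  3: s, 4: t      (GradientArgIndex == CoordArgIndex; no gradients)
//  5: clamp
//  6: rsrc, 7: samp, 8: unorm, 9: texfailctrl, 10: cachepolicy

Note that OffsetArgIndex, BiasArgIndex and ZCompareArgIndex are always defined; when the corresponding argument is absent they collapse onto the next index, so consumers must check NumOffsetArgs/NumBiasArgs/NumZCompareArgs before dereferencing them.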
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -127,14 +127,20 @@
     FloatCoord = Coord->getType()->isFloatingPointTy();
   }
 
-  if (OnlyDerivatives) {
-    if (!ST->hasG16())
-      return None;
-  } else {
-    if (!ST->hasA16())
-      OnlyDerivatives = true; // Only supports G16
+  if (!OnlyDerivatives && !ST->hasA16())
+    OnlyDerivatives = true; // Only supports G16
+
+  // Check if there is a bias parameter and if it can be converted to f16
+  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
+    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
+    if (!canSafelyConvertTo16Bit(*Bias))
+      OnlyDerivatives = true;
   }
 
+  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
+                                               ImageDimIntr->CoordStart))
+    return None;
+
   Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                                : Type::getInt16Ty(II.getContext());
 
@@ -143,8 +149,13 @@
     return None;
 
   ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
-  if (!OnlyDerivatives)
+  if (!OnlyDerivatives) {
     ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
+
+    // Change the bias type
+    if (ImageDimIntr->NumBiasArgs != 0)
+      ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
+  }
   Function *I = Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);
 
@@ -158,6 +169,12 @@
       convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
   }
 
+  // Convert the bias
+  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
+    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
+    Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
+  }
+
   CallInst *NewCall = IC.Builder.CreateCall(I, Args);
   NewCall->takeName(&II);
   NewCall->copyMetadata(II);
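The shrink is gated on canSafelyConvertTo16Bit, defined earlier in this file. A minimal sketch of that kind of check, simplified for illustration only (the real helper also accepts already-16-bit values and sign/zero-extended integers; biasFitsInHalf is a hypothetical name):

#include "llvm/ADT/APFloat.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/PatternMatch.h"

// A float bias can be shrunk when it is an fpext from half or a constant
// that converts to half without losing information.
static bool biasFitsInHalf(llvm::Value &V) {
  using namespace llvm;
  using namespace llvm::PatternMatch;
  Value *Src;
  if (match(&V, m_FPExt(m_Value(Src))))
    return Src->getType()->isHalfTy();
  if (auto *CF = dyn_cast<ConstantFP>(&V)) {
    bool LosesInfo = false;
    APFloat F = CF->getValueAPF();
    F.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }
  return false;
}

Note also the reordered early exit above: OnlyDerivatives now bails out when the intrinsic has no gradient arguments at all (GradientStart == CoordStart), which covers the case where only an unconvertible bias blocked the coordinate conversion.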
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -49,6 +49,9 @@
   unsigned BaseOpcode;
   MIMGDim Dim;
 
+  uint8_t NumOffsetArgs;
+  uint8_t NumBiasArgs;
+  uint8_t NumZCompareArgs;
   uint8_t NumGradients;
   uint8_t NumDmask;
   uint8_t NumData;
@@ -57,6 +60,9 @@
 
   uint8_t DMaskIndex;
   uint8_t VAddrStart;
+  uint8_t OffsetIndex;
+  uint8_t BiasIndex;
+  uint8_t ZCompareIndex;
   uint8_t GradientStart;
   uint8_t CoordStart;
   uint8_t LodIndex;
@@ -68,6 +74,7 @@
   uint8_t TexFailCtrlIndex;
   uint8_t CachePolicyIndex;
 
+  uint8_t BiasTyArg;
   uint8_t GradientTyArg;
   uint8_t CoordTyArg;
 };
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4273,15 +4273,18 @@
     if ((I < Intr->GradientStart) ||
         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
         (I >= Intr->CoordStart && !IsA16)) {
-      // Handle any gradient or coordinate operands that should not be packed
       if ((I < Intr->GradientStart) && IsA16 &&
           (B.getMRI()->getType(AddrReg) == S16)) {
+        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
         // Special handling of bias when A16 is on. Bias is of type half but
         // occupies full 32-bit.
         PackedAddrs.push_back(
             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                 .getReg(0));
       } else {
+        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
+               "Bias needs to be converted to 16 bit in A16 mode");
+        // Handle any gradient or coordinate operands that should not be packed
         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
         PackedAddrs.push_back(AddrReg);
       }
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1070,6 +1070,9 @@
   AMDGPUDimProps Dim = I.P.Dim;
   AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;
 
+  bits<8> NumOffsetArgs = DimEval.NumOffsetArgs;
+  bits<8> NumBiasArgs = DimEval.NumBiasArgs;
+  bits<8> NumZCompareArgs = DimEval.NumZCompareArgs;
   bits<8> NumGradients = DimEval.NumGradientArgs;
   bits<8> NumDmask = DimEval.NumDmaskArgs;
   bits<8> NumData = DimEval.NumDataArgs;
@@ -1078,6 +1081,9 @@
   bits<8> DMaskIndex = DimEval.DmaskArgIndex;
   bits<8> VAddrStart = DimEval.VAddrArgIndex;
+  bits<8> OffsetIndex = DimEval.OffsetArgIndex;
+  bits<8> BiasIndex = DimEval.BiasArgIndex;
+  bits<8> ZCompareIndex = DimEval.ZCompareArgIndex;
   bits<8> GradientStart = DimEval.GradientArgIndex;
   bits<8> CoordStart = DimEval.CoordArgIndex;
   bits<8> LodIndex = DimEval.LodArgIndex;
@@ -1089,6 +1095,8 @@
   bits<8> TexFailCtrlIndex = DimEval.TexFailCtrlArgIndex;
   bits<8> CachePolicyIndex = DimEval.CachePolicyArgIndex;
 
+  bits<8> BiasTyArg = !add(I.P.NumRetAndDataAnyTypes,
+    !if(!eq(NumOffsetArgs, 0), 0, I.P.ExtraAddrArgs[0].Type.isAny));
   bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes,
     !foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny)));
   bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
@@ -1096,10 +1104,10 @@
 def ImageDimIntrinsicTable : GenericTable {
   let FilterClass = "ImageDimIntrinsicInfo";
-  let Fields = ["Intr", "BaseOpcode", "Dim", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
-                "DMaskIndex", "VAddrStart", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
+  let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
+                "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
                 "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
-                "GradientTyArg", "CoordTyArg"];
+                "BiasTyArg", "GradientTyArg", "CoordTyArg"];
   string TypeOf_BaseOpcode = "MIMGBaseOpcode";
   string TypeOf_Dim = "MIMGDim";
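Both instruction-selection paths must keep a 16-bit bias in its own 32-bit dword rather than packing it together with the following address component. A standalone sketch of the padding idiom, assuming the GlobalISel types used above (padBiasToDword is a hypothetical helper, not code from the patch):

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"

// Widen a 16-bit bias register to a full dword by pairing it with undef,
// mirroring the buildBuildVector call in AMDGPULegalizerInfo.cpp above.
static llvm::Register padBiasToDword(llvm::MachineIRBuilder &B,
                                     llvm::Register Bias16) {
  const llvm::LLT S16 = llvm::LLT::scalar(16);
  const llvm::LLT V2S16 = llvm::LLT::fixed_vector(2, 16);
  return B.buildBuildVector(V2S16, {Bias16, B.buildUndef(S16).getReg(0)})
      .getReg(0);
}

The SelectionDAG path below does the same with getBuildVector/getUNDEF. The new asserts document the contract: the InstCombine change above is the only producer of 16-bit bias operands, and only in A16 mode.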
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6316,12 +6316,18 @@
   // Push back extra arguments.
   for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
     if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
+      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
       // Special handling of bias when A16 is on. Bias is of type half but
       // occupies full 32-bit.
-      SDValue bias = DAG.getBuildVector( MVT::v2f16, DL, {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
-      VAddrs.push_back(bias);
-    } else
+      SDValue Bias = DAG.getBuildVector(
+          MVT::v2f16, DL,
+          {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
+      VAddrs.push_back(Bias);
+    } else {
+      assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
+             "Bias needs to be converted to 16 bit in A16 mode");
       VAddrs.push_back(Op.getOperand(ArgOffset + I));
+    }
   }
 
   if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
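The updated tests below pin down both sides of the combine: a bias that is an fpext from half is folded into the f16 bias overload, while a genuine 32-bit bias blocks only the bias conversion. As a hypothetical illustration (names invented; not code from the patch), emitting the shrunk form directly with IRBuilder uses the overload order <ret, bias, coord> visible in the mangled suffixes:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

// Build llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16 with a half bias and a
// half coordinate, matching the CHECK lines in the b16 tests below.
static llvm::CallInst *emitSampleB1DHalfBias(llvm::IRBuilder<> &B,
                                             llvm::Value *BiasH,
                                             llvm::Value *SH,
                                             llvm::Value *Rsrc,
                                             llvm::Value *Samp) {
  using namespace llvm;
  Module *M = B.GetInsertBlock()->getModule();
  Type *V4F32 = FixedVectorType::get(B.getFloatTy(), 4);
  Function *F = Intrinsic::getDeclaration(
      M, Intrinsic::amdgcn_image_sample_b_1d,
      {V4F32, B.getHalfTy(), B.getHalfTy()});
  return B.CreateCall(F, {B.getInt32(15), BiasH, SH, Rsrc, Samp,
                          B.getInt1(false), B.getInt32(0), B.getInt32(0)});
}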
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -3019,9 +3019,23 @@
   ret void
 }
 
-define amdgpu_kernel void @image_sample_a16_b_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) {
-; CHECK-LABEL: @image_sample_a16_b_1d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_b16_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s) {
+; CHECK-LABEL: @image_sample_a16_b16_1d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+  %bias32 = fpext half %bias to float
+  %s32 = fpext half %s to float
+  %res = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias32, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_b32_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) {
+; CHECK-LABEL: @image_sample_a16_b32_1d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
@@ -3031,9 +3045,25 @@
   ret void
 }
 
-define amdgpu_kernel void @image_sample_a16_b_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) {
-; CHECK-LABEL: @image_sample_a16_b_2d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_b16_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t) {
+; CHECK-LABEL: @image_sample_a16_b16_2d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+  %bias32 = fpext half %bias to float
+  %s32 = fpext half %s to float
+  %t32 = fpext half %t to float
+  %res = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float %bias32, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_b32_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) {
+; CHECK-LABEL: @image_sample_a16_b32_2d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
@@ -3044,9 +3074,23 @@
   ret void
 }
 
-define amdgpu_kernel void @image_sample_a16_c_b_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) {
-; CHECK-LABEL: @image_sample_a16_c_b_1d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_c_b16_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s) {
+; CHECK-LABEL: @image_sample_a16_c_b16_1d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+  %bias32 = fpext half %bias to float
+  %s32 = fpext half %s to float
+  %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias32, float %zcompare, float %s32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_c_b32_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) {
+; CHECK-LABEL: @image_sample_a16_c_b32_1d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
@@ -3056,9 +3100,25 @@
   ret void
 }
 
-define amdgpu_kernel void @image_sample_a16_c_b_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) {
-; CHECK-LABEL: @image_sample_a16_c_b_2d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_c_b16_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t) {
+; CHECK-LABEL: @image_sample_a16_c_b16_2d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+  %bias32 = fpext half %bias to float
+  %s32 = fpext half %s to float
+  %t32 = fpext half %t to float
+  %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float %bias32, float %zcompare, float %s32, float %t32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_c_b32_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) {
+; CHECK-LABEL: @image_sample_a16_c_b32_2d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[T32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
@@ -3069,9 +3129,25 @@
   ret void
 }
 
-define amdgpu_kernel void @image_sample_a16_b_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) {
-; CHECK-LABEL: @image_sample_a16_b_cl_1d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_b16_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_b16_cl_1d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+  %bias32 = fpext half %bias to float
+  %s32 = fpext half %s to float
+  %clamp32 = fpext half %clamp to float
+  %res = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float %bias32, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_b32_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_b32_cl_1d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
@@ -3082,9 +3158,27 @@
   ret void
 }
 
-define amdgpu_kernel void @image_sample_a16_b_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) {
-; CHECK-LABEL: @image_sample_a16_b_cl_2d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_b16_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, half %s, half %t, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_b16_cl_2d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+  %bias32 = fpext half %bias to float
+  %s32 = fpext half %s to float
+  %t32 = fpext half %t to float
+  %clamp32 = fpext half %clamp to float
+  %res = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float %bias32, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_b32_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_b32_cl_2d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
+; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
@@ -3096,9 +3190,25 @@
   ret void
 }
 
-define amdgpu_kernel void @image_sample_a16_c_b_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) {
-; CHECK-LABEL: @image_sample_a16_c_b_cl_1d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_c_b16_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_c_b16_cl_1d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+  %bias32 = fpext half %bias to float
+  %s32 = fpext half %s to float
+  %clamp32 = fpext half %clamp to float
+  %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias32, float %zcompare, float %s32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_c_b32_cl_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_c_b32_cl_1d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;
@@ -3109,9 +3219,27 @@
   ret void
 }
 
-define amdgpu_kernel void @image_sample_a16_c_b_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) {
-; CHECK-LABEL: @image_sample_a16_c_b_cl_2d(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+define amdgpu_kernel void @image_sample_a16_c_b16_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %bias, float %zcompare, half %s, half %t, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_c_b16_cl_2d(
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half [[BIAS:%.*]], float [[ZCOMPARE:%.*]], half [[S:%.*]], half [[T:%.*]], half [[CLAMP:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+  %bias32 = fpext half %bias to float
+  %s32 = fpext half %s to float
+  %t32 = fpext half %t to float
+  %clamp32 = fpext half %clamp to float
+  %res = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float %bias32, float %zcompare, float %s32, float %t32, float %clamp32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_c_b32_cl_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) {
+; CHECK-LABEL: @image_sample_a16_c_b32_cl_2d(
+; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT: [[T32:%.*]] = fpext half [[T:%.*]] to float
+; CHECK-NEXT: [[CLAMP32:%.*]] = fpext half [[CLAMP:%.*]] to float
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S32]], float [[T32]], float [[CLAMP32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
 ; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
 ; CHECK-NEXT: ret void
 ;