diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -58,24 +58,37 @@
 // Check if a value can be converted to a 16-bit value without losing
 // precision.
-static bool canSafelyConvertTo16Bit(Value &V) {
+// The value is expected to be either a float (IsFloat = true) or an unsigned
+// integer (IsFloat = false).
+static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
   Type *VTy = V.getType();
   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
     // The value is already 16-bit, so we don't want to convert to 16-bit again!
     return false;
   }
-  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
-    // We need to check that if we cast the index down to a half, we do not lose
-    // precision.
-    APFloat FloatValue(ConstFloat->getValueAPF());
-    bool LosesInfo = true;
-    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
-    return !LosesInfo;
+  if (IsFloat) {
+    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
+      // We need to check that if we cast the index down to a half, we do not
+      // lose precision.
+      APFloat FloatValue(ConstFloat->getValueAPF());
+      bool LosesInfo = true;
+      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
+                         &LosesInfo);
+      return !LosesInfo;
+    }
+  } else {
+    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
+      // We need to check that if we cast the index down to an i16, we do not
+      // lose precision.
+      APInt IntValue(ConstInt->getValue());
+      return IntValue.getActiveBits() <= 16;
+    }
   }
+
   Value *CastSrc;
-  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
-      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
-      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
+  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
+                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
+  if (IsExt) {
     Type *CastSrcTy = CastSrc->getType();
     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
       return true;
@@ -203,6 +216,10 @@
   if (!ST->hasA16() && !ST->hasG16())
     return None;
 
+  // Address is interpreted as float if the instruction has a sampler or as
+  // unsigned int if there is no sampler.
+  bool HasSampler =
+      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
   bool FloatCoord = false;
   // true means derivatives can be converted to 16 bit, coordinates not
   bool OnlyDerivatives = false;
@@ -211,7 +228,7 @@
        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
     Value *Coord = II.getOperand(OperandIndex);
     // If the values are not derived from 16-bit values, we cannot optimize.
-    if (!canSafelyConvertTo16Bit(*Coord)) {
+    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
       if (OperandIndex < ImageDimIntr->CoordStart ||
           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
         return None;
@@ -232,7 +249,9 @@
   // Check if there is a bias parameter and if it can be converted to f16
   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
-    if (!canSafelyConvertTo16Bit(*Bias))
+    assert(HasSampler &&
+           "Only image instructions with a sampler can have a bias");
+    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
       OnlyDerivatives = true;
   }
 
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -3667,6 +3667,105 @@
   ret void
 }
 
+define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) {
+; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const(
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half 0xH3400, half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    store <2 x float> [[RES]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %dsdh32 = fpext half %dsdh to float
+  %dtdh32 = fpext half %dtdh to float
+  %dsdv32 = fpext half %dsdv to float
+  %dtdv32 = fpext half %dtdv to float
+  %s32 = fpext half %s to float
+  %slice32 = fpext half %slice to float
+  %res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float 0.25, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  store <2 x float> %res, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const_noopt(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) {
+; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const_noopt(
+; CHECK-NEXT:    [[S32:%.*]] = fpext half [[S:%.*]] to float
+; CHECK-NEXT:    [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S32]], float 1.000000e+10, float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    store <2 x float> [[RES]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %dsdh32 = fpext half %dsdh to float
+  %dtdh32 = fpext half %dtdh to float
+  %dsdv32 = fpext half %dsdv to float
+  %dtdv32 = fpext half %dtdv to float
+  %s32 = fpext half %s to float
+  %slice32 = fpext half %slice to float
+  %res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float 1.0e+10, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  store <2 x float> %res, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_load_a16_mip_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
+; CHECK-LABEL: @image_load_a16_mip_1d(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s32 = zext i16 %s to i32
+  %res = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_load_a16_mip_1d_noopt(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
+; CHECK-LABEL: @image_load_a16_mip_1d_noopt(
+; CHECK-NEXT:    [[S32:%.*]] = sext i16 [[S:%.*]] to i32
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S32]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s32 = sext i16 %s to i32
+  %res = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_load_a16_mip_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s, i16 %t) {
+; CHECK-LABEL: @image_load_a16_mip_2d(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s32 = zext i16 %s to i32
+  %t32 = zext i16 %t to i32
+  %res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 %t32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_load_a16_mip_2d_const(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
+; CHECK-LABEL: @image_load_a16_mip_2d_const(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 -1, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s32 = zext i16 %s to i32
+  %res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 65535, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_load_a16_mip_2d_const_noopt(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
+; CHECK-LABEL: @image_load_a16_mip_2d_const_noopt(
+; CHECK-NEXT:    [[S32:%.*]] = zext i16 [[S:%.*]] to i32
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S32]], i32 65536, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %s32 = zext i16 %s to i32
+  %res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 65536, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.image.sample g16
 ; --------------------------------------------------------------------
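
Note on the integer path in canSafelyConvertTo16Bit: APInt::getActiveBits() returns the number of bits needed to represent the value as an unsigned integer, so the `<= 16` test accepts exactly the constants that survive a zext-compatible truncation to i16. Below is a minimal standalone sketch (not part of the patch; the helper name is hypothetical, and it assumes only LLVM's ADT headers) that mirrors the check against the two constant test cases above:

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  // Hypothetical mirror of the ConstantInt branch of canSafelyConvertTo16Bit:
  // an unsigned coordinate constant may be narrowed to i16 only if its value
  // fits in 16 bits.
  static bool fitsInUnsigned16(const llvm::APInt &Value) {
    return Value.getActiveBits() <= 16;
  }

  int main() {
    // 65535 needs exactly 16 bits, so @image_load_a16_mip_2d_const folds the
    // coordinate to i16 -1.
    assert(fitsInUnsigned16(llvm::APInt(32, 65535)));
    // 65536 needs 17 bits, so @image_load_a16_mip_2d_const_noopt keeps the
    // i32 coordinates.
    assert(!fitsInUnsigned16(llvm::APInt(32, 65536)));
    return 0;
  }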