diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -58,12 +58,14 @@
 
 // Check if a value can be converted to a 16-bit value without losing
 // precision.
-static bool canSafelyConvertTo16Bit(Value &V) {
+static bool areAllDefs16Bit(Value &V) {
   Type *VTy = V.getType();
-  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
+  Type *STy = VTy->getScalarType();
+  if (STy->isHalfTy() || STy->isIntegerTy(16)) {
     // The value is already 16-bit, so we don't want to convert to 16-bit again!
     return false;
   }
+
   if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
     // We need to check that if we cast the index down to a half, we do not lose
     // precision.
@@ -72,11 +74,22 @@
     FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
     return !LosesInfo;
   }
+
+  if (VTy->isVectorTy()) {
+    if (auto *ConstVec = dyn_cast<Constant>(&V)) {
+      for (auto &Part : ConstVec->operands()) {
+        if (!areAllDefs16Bit(*Part))
+          return false;
+      }
+      return true;
+    }
+  }
+
   Value *CastSrc;
   if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
       match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
       match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
-    Type *CastSrcTy = CastSrc->getType();
+    Type *CastSrcTy = CastSrc->getType()->getScalarType();
     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
       return true;
   }
@@ -89,14 +102,37 @@
   Type *VTy = V.getType();
   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
     return cast<Instruction>(&V)->getOperand(0);
-  if (VTy->isIntegerTy())
-    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
-  if (VTy->isFloatingPointTy())
-    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
+  auto *NewScalarTy =
+      VTy->isFPOrFPVectorTy() ? Builder.getHalfTy() : Builder.getInt16Ty();
+  Type *NewTy;
+  if (auto *VectorTy = dyn_cast<VectorType>(VTy))
+    NewTy = VectorType::get(NewScalarTy, VectorTy->getElementCount());
+  else
+    NewTy = NewScalarTy;
+
+  if (VTy->isIntOrIntVectorTy())
+    return Builder.CreateTrunc(&V, NewTy);
+  if (VTy->isFPOrFPVectorTy())
+    return Builder.CreateFPTrunc(&V, NewTy);
 
   llvm_unreachable("Should never be called!");
 }
 
+/// Check if all uses of a value only need 16-bit precision.
+static bool areAllUses16Bit(Value &V) {
+  for (auto *Use : V.users()) {
+    Value *CastSrc;
+    if (match(Use, m_FPTrunc(PatternMatch::m_Value(CastSrc))) ||
+        match(Use, m_Trunc(PatternMatch::m_Value(CastSrc)))) {
+      Type *CastDestTy = Use->getType()->getScalarType();
+      if (CastDestTy->isHalfTy() || CastDestTy->isIntegerTy(16))
+        continue;
+    }
+    return false;
+  }
+  return true;
+}
+
 /// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with
 /// the modified arguments.
 static Optional<Instruction *> modifyIntrinsicCall(
@@ -182,6 +218,60 @@
     }
   }
 
+  // Try to use D16
+  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
+      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
+  if (ST->hasD16LoadStore() && BaseInfo->HasD16) {
+    Type *DataTy;
+    if (BaseInfo->Store)
+      DataTy = II.getFunctionType()->getParamType(0);
+    else
+      DataTy = II.getType();
+    auto *ScalarTy = DataTy->getScalarType();
+
+    if ((ScalarTy->isFloatTy() || ScalarTy->isIntegerTy(32))) {
+      auto *NewScalarTy = ScalarTy->isFloatTy() ? IC.Builder.getHalfTy()
+                                                : IC.Builder.getInt16Ty();
+      Type *NewDataTy;
+      if (auto *VTy = dyn_cast<VectorType>(DataTy))
+        NewDataTy = VectorType::get(NewScalarTy, VTy->getElementCount());
+      else
+        NewDataTy = NewScalarTy;
+
+      if (BaseInfo->Store) {
+        if (areAllDefs16Bit(*II.getArgOperand(0))) {
+          return modifyIntrinsicCall(
+              II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
+                ArgTys[0] = NewDataTy;
+                Args[0] = convertTo16Bit(*Args[0], IC.Builder);
+              });
+        }
+      } else {
+        if (areAllUses16Bit(II)) {
+          SmallVector<Type *> ArgTys;
+          if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
+            return None;
+
+          SmallVector<Value *> Args(II.args());
+
+          ArgTys[0] = NewDataTy;
+
+          Function *I = Intrinsic::getDeclaration(II.getModule(),
+                                                  II.getIntrinsicID(), ArgTys);
+
+          CallInst *NewCall = IC.Builder.CreateCall(I, Args);
+          NewCall->takeName(&II);
+          NewCall->copyMetadata(II);
+          if (isa<FPMathOperator>(NewCall))
+            NewCall->copyFastMathFlags(&II);
+
+          auto *NewValue = IC.Builder.CreateFPExt(NewCall, DataTy);
+          return IC.replaceInstUsesWith(II, NewValue);
+        }
+      }
+    }
+  }
+
   // Try to use A16 or G16
   if (!ST->hasA16() && !ST->hasG16())
     return None;
@@ -194,7 +284,7 @@
        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
     Value *Coord = II.getOperand(OperandIndex);
     // If the values are not derived from 16-bit values, we cannot optimize.
-    if (!canSafelyConvertTo16Bit(*Coord)) {
+    if (!areAllDefs16Bit(*Coord)) {
       if (OperandIndex < ImageDimIntr->CoordStart ||
           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
         return None;
@@ -215,7 +305,7 @@
   // Check if there is a bias parameter and if it can be converted to f16
   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
-    if (!canSafelyConvertTo16Bit(*Bias))
+    if (!areAllDefs16Bit(*Bias))
       OnlyDerivatives = true;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -38,12 +38,6 @@
 };
 const RsrcIntrinsic *lookupRsrcIntrinsic(unsigned Intr);
 
-struct D16ImageDimIntrinsic {
-  unsigned Intr;
-  unsigned D16HelperIntr;
-};
-const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);
-
 struct ImageDimIntrinsicInfo {
   unsigned Intr;
   unsigned BaseOpcode;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -43,11 +43,10 @@
 class AAResults;
 
 namespace AMDGPU {
-#define GET_D16ImageDimIntrinsics_IMPL
 #define GET_ImageDimIntrinsicTable_IMPL
 #define GET_RsrcIntrinsics_IMPL
 #include "AMDGPUGenSearchableTables.inc"
-}
+} // namespace AMDGPU
 
 }
 
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2800,6 +2800,80 @@
   ret void
 }
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample d16
+; --------------------------------------------------------------------
+
+define amdgpu_kernel void @image_sample_d16_1d(<4 x half> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+; CHECK-LABEL: @image_sample_d16_1d(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.1d.v4f16.f32(i32 15, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    store <4 x half> [[RES]], <4 x half> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %res16 = fptrunc <4 x float> %res to <4 x half>
+  store <4 x half> %res16, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_gather_d16_2d(<4 x half> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+; CHECK-LABEL: @image_gather_d16_2d(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x half> @llvm.amdgcn.image.gather4.2d.v4f16.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    store <4 x half> [[RES]], <4 x half> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %res = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %res16 = fptrunc <4 x float> %res to <4 x half>
+  store <4 x half> %res16, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_load_d16_2d(<4 x half> addrspace(1)* %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; CHECK-LABEL: @image_load_d16_2d(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    store <4 x half> [[RES]], <4 x half> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %res = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  %res16 = fptrunc <4 x float> %res to <4 x half>
+  store <4 x half> %res16, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_store_d16_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
+; CHECK-LABEL: @image_store_d16_2d(
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.f16.i32(half [[DATA:%.*]], i32 1, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %data32 = fpext half %data to float
+  call void @llvm.amdgcn.image.store.2d.f32.i32(float %data32, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_kernel void @image_store_d16_2d_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %data) {
+; CHECK-LABEL: @image_store_d16_2d_v4f32(
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> [[DATA:%.*]], i32 1, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %data32 = fpext <4 x half> %data to <4 x float>
+  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %data32, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_kernel void @image_store_d16_2d_const_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; CHECK-LABEL: @image_store_d16_2d_const_v4f32(
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> <half 0xH0000, half 0xH3C00, half 0xH4000, half 0xH4200>, i32 1, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> < float 0.0, float 1.0, float 2.0, float 3.0 >, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32)
+declare void @llvm.amdgcn.image.store.2d.f32.i32(float, i32, i32, i32, <8 x i32>, i32, i32)
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32)
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.image.sample a16
 ; --------------------------------------------------------------------