This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AMDGPU/
-
Target/
-
AMDGPU/
2/2
AMDGPUInstCombineIntrinsic.cpp
-
test/Transforms/InstCombine/AMDGPU/
-
Transforms/
-
InstCombine/
-
AMDGPU/
-
amdgcn-intrinsics.ll

Differential D118043

[AMDGPU] Only match correct type for a16
ClosedPublic

Authored by sebastian-ne on Jan 24 2022, 7:13 AM.

Download Raw Diff

Details

Reviewers

arsenm
foad
rampitec

Commits

rG4ed7c6eec979: [AMDGPU] Only match correct type for a16

Summary

Addresses are floats when a sampler is present and unsigned integers
when no sampler is present.

Therefore, only zext instructions, not sext instructions should match.

Also match integer constants that can be truncated.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

sebastian-ne created this revision. · Jan 24 2022, 7:13 AM

Herald added subscribers: kerbowa, hiraditya, t-tye and 6 others. · View Herald Transcript · Jan 24 2022, 7:13 AM

sebastian-ne requested review of this revision. · Jan 24 2022, 7:13 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jan 24 2022, 7:13 AM

Herald added subscribers: llvm-commits, wdng. · View Herald Transcript

LGTM, thanks! Just a couple of nits inline.

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
81	s/half/i16/ ?
89–91	I have a slight preference for `IsFloat ? match(...) : match(...)`.

This revision is now accepted and ready to land. · Jan 24 2022, 8:45 AM

Thanks for the review, I fixed the comments.

This revision was landed with ongoing or failed builds. · Jan 25 2022, 6:07 AM

Closed by commit rG4ed7c6eec979: [AMDGPU] Only match correct type for a16 (authored by sebastian-ne). · Explain Why

This revision was automatically updated to reflect the committed changes.

sebastian-ne added a commit: rG4ed7c6eec979: [AMDGPU] Only match correct type for a16.

Harbormaster completed remote builds in B145264: Diff 402574. · Jan 26 2022, 10:51 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPUInstCombineIntrinsic.cpp

45 lines

test/

Transforms/

InstCombine/

AMDGPU/

amdgcn-intrinsics.ll

99 lines

Diff 402882

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Show First 20 Lines • Show All 52 Lines • ▼ Show 20 Lines	static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
if (Cmp1 == APFloat::cmpEqual)		if (Cmp1 == APFloat::cmpEqual)
return maxnum(Src0, Src2);		return maxnum(Src0, Src2);

return maxnum(Src0, Src1);		return maxnum(Src0, Src1);
}		}

// Check if a value can be converted to a 16-bit value without losing		// Check if a value can be converted to a 16-bit value without losing
// precision.		// precision.
static bool canSafelyConvertTo16Bit(Value &V) {		// The value is expected to be either a float (IsFloat = true) or an unsigned
		// integer (IsFloat = false).
		static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
Type *VTy = V.getType();		Type *VTy = V.getType();
if (VTy->isHalfTy() \|\| VTy->isIntegerTy(16)) {		if (VTy->isHalfTy() \|\| VTy->isIntegerTy(16)) {
// The value is already 16-bit, so we don't want to convert to 16-bit again!		// The value is already 16-bit, so we don't want to convert to 16-bit again!
return false;		return false;
}		}
		if (IsFloat) {
if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {		if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
// We need to check that if we cast the index down to a half, we do not lose		// We need to check that if we cast the index down to a half, we do not
// precision.		// lose precision.
APFloat FloatValue(ConstFloat->getValueAPF());		APFloat FloatValue(ConstFloat->getValueAPF());
bool LosesInfo = true;		bool LosesInfo = true;
FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);		FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
		&LosesInfo);
return !LosesInfo;		return !LosesInfo;
}		}
		} else {
		if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
		// We need to check that if we cast the index down to an i16, we do not
		[Inline comment by foad (marked Done): s/half/i16/ ?]
		// lose precision.
		APInt IntValue(ConstInt->getValue());
		return IntValue.getActiveBits() <= 16;
		}
		}

Value *CastSrc;		Value *CastSrc;
if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) \|\|		bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) \|\|		: match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {		if (IsExt) {
		[Inline comment by foad (marked Done): I have a slight preference for `IsFloat ? match(...) : match(...)`.]
Type *CastSrcTy = CastSrc->getType();		Type *CastSrcTy = CastSrc->getType();
if (CastSrcTy->isHalfTy() \|\| CastSrcTy->isIntegerTy(16))		if (CastSrcTy->isHalfTy() \|\| CastSrcTy->isIntegerTy(16))
return true;		return true;
}		}

return false;		return false;
}		}

▲ Show 20 Lines • Show All 111 Lines • ▼ Show 20 Lines	if (auto *ConstantOffset =
}		}
}		}
}		}

// Try to use A16 or G16		// Try to use A16 or G16
if (!ST->hasA16() && !ST->hasG16())		if (!ST->hasA16() && !ST->hasG16())
return None;		return None;

		// Address is interpreted as float if the instruction has a sampler or as
		// unsigned int if there is no sampler.
		bool HasSampler =
		AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
bool FloatCoord = false;		bool FloatCoord = false;
// true means derivatives can be converted to 16 bit, coordinates not		// true means derivatives can be converted to 16 bit, coordinates not
bool OnlyDerivatives = false;		bool OnlyDerivatives = false;

for (unsigned OperandIndex = ImageDimIntr->GradientStart;		for (unsigned OperandIndex = ImageDimIntr->GradientStart;
OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {		OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
Value *Coord = II.getOperand(OperandIndex);		Value *Coord = II.getOperand(OperandIndex);
// If the values are not derived from 16-bit values, we cannot optimize.		// If the values are not derived from 16-bit values, we cannot optimize.
if (!canSafelyConvertTo16Bit(*Coord)) {		if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
if (OperandIndex < ImageDimIntr->CoordStart \|\|		if (OperandIndex < ImageDimIntr->CoordStart \|\|
ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {		ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
return None;		return None;
}		}
// All gradients can be converted, so convert only them		// All gradients can be converted, so convert only them
OnlyDerivatives = true;		OnlyDerivatives = true;
break;		break;
}		}

assert(OperandIndex == ImageDimIntr->GradientStart \|\|		assert(OperandIndex == ImageDimIntr->GradientStart \|\|
FloatCoord == Coord->getType()->isFloatingPointTy());		FloatCoord == Coord->getType()->isFloatingPointTy());
FloatCoord = Coord->getType()->isFloatingPointTy();		FloatCoord = Coord->getType()->isFloatingPointTy();
}		}

if (!OnlyDerivatives && !ST->hasA16())		if (!OnlyDerivatives && !ST->hasA16())
OnlyDerivatives = true; // Only supports G16		OnlyDerivatives = true; // Only supports G16

// Check if there is a bias parameter and if it can be converted to f16		// Check if there is a bias parameter and if it can be converted to f16
if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {		if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);		Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
if (!canSafelyConvertTo16Bit(*Bias))		assert(HasSampler &&
		"Only image instructions with a sampler can have a bias");
		if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
OnlyDerivatives = true;		OnlyDerivatives = true;
}		}

if (OnlyDerivatives && (!ST->hasG16() \|\| ImageDimIntr->GradientStart ==		if (OnlyDerivatives && (!ST->hasG16() \|\| ImageDimIntr->GradientStart ==
ImageDimIntr->CoordStart))		ImageDimIntr->CoordStart))
return None;		return None;

Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())		Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
▲ Show 20 Lines • Show All 942 Lines • Show Last 20 Lines

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,661 Lines • ▼ Show 20 Lines	;
%s32 = fpext half %s to float		%s32 = fpext half %s to float
%t32 = fpext half %t to float		%t32 = fpext half %t to float
%slice32 = fpext half %slice to float		%slice32 = fpext half %slice to float
%res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)		%res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float %t32, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
store <2 x float> %res, <2 x float> addrspace(1)* %out		store <2 x float> %res, <2 x float> addrspace(1)* %out
ret void		ret void
}		}

		define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) {
		; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const(
		; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half 0xH3400, half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
		; CHECK-NEXT: store <2 x float> [[RES]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8
		; CHECK-NEXT: ret void
		;
		%dsdh32 = fpext half %dsdh to float
		%dtdh32 = fpext half %dtdh to float
		%dsdv32 = fpext half %dsdv to float
		%dtdv32 = fpext half %dtdv to float
		%s32 = fpext half %s to float
		%slice32 = fpext half %slice to float
		%res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float 0.25, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
		store <2 x float> %res, <2 x float> addrspace(1)* %out
		ret void
		}

		define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const_noopt(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) {
		; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const_noopt(
		; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
		; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float
		; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S32]], float 1.000000e+10, float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
		; CHECK-NEXT: store <2 x float> [[RES]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8
		; CHECK-NEXT: ret void
		;
		%dsdh32 = fpext half %dsdh to float
		%dtdh32 = fpext half %dtdh to float
		%dsdv32 = fpext half %dsdv to float
		%dtdv32 = fpext half %dtdv to float
		%s32 = fpext half %s to float
		%slice32 = fpext half %slice to float
		%res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float 1.0e+10, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
		store <2 x float> %res, <2 x float> addrspace(1)* %out
		ret void
		}

		define amdgpu_kernel void @image_load_a16_mip_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
		; CHECK-LABEL: @image_load_a16_mip_1d(
		; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
		; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
		; CHECK-NEXT: ret void
		;
		%s32 = zext i16 %s to i32
		%res = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
		store <4 x float> %res, <4 x float> addrspace(1)* %out
		ret void
		}

		define amdgpu_kernel void @image_load_a16_mip_1d_noopt(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
		; CHECK-LABEL: @image_load_a16_mip_1d_noopt(
		; CHECK-NEXT: [[S32:%.*]] = sext i16 [[S:%.*]] to i32
		; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S32]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
		; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
		; CHECK-NEXT: ret void
		;
		%s32 = sext i16 %s to i32
		%res = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
		store <4 x float> %res, <4 x float> addrspace(1)* %out
		ret void
		}

		define amdgpu_kernel void @image_load_a16_mip_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s, i16 %t) {
		; CHECK-LABEL: @image_load_a16_mip_2d(
		; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
		; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
		; CHECK-NEXT: ret void
		;
		%s32 = zext i16 %s to i32
		%t32 = zext i16 %t to i32
		%res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 %t32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
		store <4 x float> %res, <4 x float> addrspace(1)* %out
		ret void
		}

		define amdgpu_kernel void @image_load_a16_mip_2d_const(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
		; CHECK-LABEL: @image_load_a16_mip_2d_const(
		; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 -1, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
		; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
		; CHECK-NEXT: ret void
		;
		%s32 = zext i16 %s to i32
		%res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 65535, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
		store <4 x float> %res, <4 x float> addrspace(1)* %out
		ret void
		}

		define amdgpu_kernel void @image_load_a16_mip_2d_const_noopt(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
		; CHECK-LABEL: @image_load_a16_mip_2d_const_noopt(
		; CHECK-NEXT: [[S32:%.*]] = zext i16 [[S:%.*]] to i32
		; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S32]], i32 65536, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
		; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
		; CHECK-NEXT: ret void
		;
		%s32 = zext i16 %s to i32
		%res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 65536, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
		store <4 x float> %res, <4 x float> addrspace(1)* %out
		ret void
		}

; --------------------------------------------------------------------		; --------------------------------------------------------------------
; llvm.amdgcn.image.sample g16		; llvm.amdgcn.image.sample g16
; --------------------------------------------------------------------		; --------------------------------------------------------------------

define amdgpu_kernel void @image_sample_g16_d_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {		define amdgpu_kernel void @image_sample_g16_d_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; CHECK-LABEL: @image_sample_g16_d_1d(		; CHECK-LABEL: @image_sample_g16_d_1d(
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)		; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half [[DSDH:%.*]], half [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16		; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
▲ Show 20 Lines • Show All 1,563 Lines • Show Last 20 Lines