This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][InstCombine] Use D16 if only f16 precision is needed
Abandoned · Public

Authored by sebastian-ne on Jan 21 2022, 4:34 AM.

Details

Reviewers
arsenm
foad
Summary

If an image load feeds only into truncations to 16 bits, use a d16 load.
In the same way, if a stored value has only 16-bit precision, enable
d16.
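
A minimal sketch of the load-side check this describes, in LLVM's C++ style; the helper name onlyNeedsF16 and the exact user walk are illustrative assumptions, not the patch's actual code:

    // Illustrative only (needs llvm/IR/IntrinsicInst.h): returns true if
    // every user of the image load truncates the result to half, so a
    // d16 load would produce the same observable values.
    static bool onlyNeedsF16(const llvm::IntrinsicInst &ImageLoad) {
      for (const llvm::User *U : ImageLoad.users()) {
        const auto *FPT = llvm::dyn_cast<llvm::FPTruncInst>(U);
        if (!FPT || !FPT->getDestTy()->getScalarType()->isHalfTy())
          return false;
      }
      return true;
    }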


Event Timeline

sebastian-ne created this revision. · Jan 21 2022, 4:34 AM
sebastian-ne requested review of this revision. · Jan 21 2022, 4:34 AM
Herald added a project: Restricted Project. · Jan 21 2022, 4:34 AM
foad added inline comments. · Jan 21 2022, 5:11 AM
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
64–67

I don't understand why we need this code.

90–91

I know you haven't changed this, but it seems dangerous: for a d16 input treated as unsigned you should not match sext here, and for a d16 input treated as signed you should not match zext. Or do all d16 integer inputs ignore the high bits?
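
A standalone illustration of the hazard (not code from this patch): the same 16 low bits re-extend to different 32-bit values depending on signedness, so folding the wrong extension kind into a d16 operation would change results.

    #include <cstdint>
    #include <cstdio>

    int main() {
      int16_t Half = -1;                            // bit pattern 0xffff
      uint32_t ZExt = static_cast<uint16_t>(Half);  // 0x0000ffff
      int32_t SExt = Half;                          // 0xffffffff
      std::printf("zext: %#x, sext: %#x\n", ZExt,
                  static_cast<uint32_t>(SExt));
      return 0;
    }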

107

NewTy = VTy->getWithNewType(NewScalarTy)?
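
For reference, a hedged sketch of the suggested call: getWithNewType preserves the element count and only swaps the element type, e.g. <4 x float> to <4 x half>.

    // Hypothetical use, assuming VTy is an llvm::VectorType* and
    // NewScalarTy an llvm::Type* as in the surrounding code
    // (static helper declared in llvm/IR/DerivedTypes.h):
    llvm::VectorType *NewTy =
        llvm::VectorType::getWithNewType(VTy, NewScalarTy);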

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
41

If this was already unused, you can remove it with a separate NFC patch (consider it pre-approved if you want).

sebastian-ne abandoned this revision. · Jan 24 2022, 5:45 AM
sebastian-ne added inline comments.
llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
90–91

You’re right, this seems to be incorrect. Good catch.
As far as I understand, the format field in the image descriptor decides whether float or int conversion is used, so the compiler can’t combine d16 at all.
For A16, it’s a uint for instructions without a sampler and a float for instructions with a sampler. I’ll fix that.
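
A minimal sketch of that A16 rule; the helper name getA16CoordType is hypothetical, not the patch's actual code:

    // Illustrative only: A16 coordinates are 16-bit integers for
    // unsampled image instructions and 16-bit floats for sampled ones.
    static llvm::Type *getA16CoordType(llvm::LLVMContext &Ctx,
                                       bool HasSampler) {
      return HasSampler ? llvm::Type::getHalfTy(Ctx)
                        : llvm::Type::getInt16Ty(Ctx);
    }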