This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Add optimization patterns to combine fp32->fp16 conversions
Needs Review · Public

Authored by pendingchaos on Dec 1 2018, 10:57 AM.

Download Raw Diff

Details

Reviewers

arsenm
tstellar
alex-t

Summary

This makes a build_vector (or equivalent) of the low or high words of
cvt_pkrtz_f16_f32 results get selected as a single v_cvt_pkrtz_f16_f32.

Diff Detail

Event Timeline

pendingchaos created this revision. · Dec 1 2018, 10:57 AM

Herald added subscribers: t-tye, tpr, dstuttard and 5 others. · View Herald Transcript · Dec 1 2018, 10:57 AM

I don't have commit access.

ping?

arsenm added inline comments. · Dec 16 2018, 10:32 PM

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
2102	Capitalize and punctuate
2137–2138	I think you should need only one or the other here, not both. Either way, you could move this out of the function (and I think there's a wrapper for this somewhere already)
2148	You can use the constructor directly without the extra = APFloat
2182	Capitalize hi
2211–2216	computeKnownBits?
lib/Target/AMDGPU/SIInstructions.td
1583	Should use isVI, or maybe these should be distinguished by GCN3Encoding? Needs a comment for why these are separated
test/CodeGen/AMDGPU/cvt_pkrtz_f16_f32_combine.ll
24	Probably should explicitly test the different encodings for the different sub targets

pendingchaos added inline comments. · Dec 17 2018, 1:24 PM

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
2211–2216	I don't see how that would help with obtaining the source of the low/high 16 bits (the cvt_pkrtz node), or how it would end up making the code more generic/smaller, since it would still have to match "and(cvt_pkrtz(v, ), 0xffff)" and such. I just realized that this function could handle cvt_pkrtz(v, 0) and cvt_pkrtz(0, v) (without any ands or shifts). Should I make it do so (and do something similar for SelectCvtRtzF16F32)?
lib/Target/AMDGPU/SIInstructions.td
1583	Since all GCN versions support the VOP3a form, shouldn't it use isGCN (to combine the modifiers into the instruction on SI/CI)? The VOP2 form is only supported on SI/CI, so isSICI is used. IIRC VOP2 ended up being used when no modifiers could be folded.

arsenm added inline comments. · Jan 21 2019, 3:00 PM

lib/Target/AMDGPU/SIInstructions.td
1583	isGCN is obsolete and should be removed anywhere it's used. Since we try to shrink instructions later, we only want to select the _e64 version when possible

In this update:

The 64-bit encodings are always selected and SelectCvtRtzF16F32*Mods() have been removed.
SelectCvtRtzF16F32() is now implemented with SelectCvtRtzF16F32Lo().
cvt_pkrtz(v, 0) and cvt_pkrtz(0, v) are handled in SelectCvtRtzF16F32LoHiImpl().
The test explicitly tests the different encodings for the different sub targets.
The getConstantValue() helper is used.

Herald added a subscriber: jdoerfert. · View Herald Transcript · Feb 27 2019, 4:07 AM

arsenm added inline comments. · Apr 7 2020, 2:11 PM

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
2147	Constant folding during selection is pretty weird. Can we just do this in InstSimplify?

Herald added a subscriber: kerbowa. · View Herald Transcript · Apr 7 2020, 2:11 PM

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

AMDGPUISelDAGToDAG.cpp

110 lines

SIInstrInfo.td

4 lines

SIInstructions.td

16 lines

test/

CodeGen/

AMDGPU/

cvt_pkrtz_f16_f32_combine.ll

132 lines

Diff 188516

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Show First 20 Lines • Show All 187 Lines • ▼ Show 20 Lines	bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp) const;		SDValue &Clamp) const;

bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;		bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,		bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp) const;		SDValue &Clamp) const;
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;		bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;		bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

		bool SelectLo16Elt(SDValue In, SDValue &Src) const;
bool SelectHi16Elt(SDValue In, SDValue &Src) const;		bool SelectHi16Elt(SDValue In, SDValue &Src) const;

		bool SelectCvtRtzF16F32(SDValue In, SDValue &Src, SDValue &SrcMods) const;

		bool SelectCvtRtzF16F32LoHiImpl(SDValue In, SDValue &Src, SDValue &SrcMods, bool hi) const;
		bool SelectCvtRtzF16F32Lo(SDValue In, SDValue &Src, SDValue &SrcMods) const;
		bool SelectCvtRtzF16F32Hi(SDValue In, SDValue &Src, SDValue &SrcMods) const;

void SelectADD_SUB_I64(SDNode *N);		void SelectADD_SUB_I64(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);		void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);		void SelectDIV_SCALE(SDNode *N);
void SelectMAD_64_32(SDNode *N);		void SelectMAD_64_32(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);		void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);		void SelectFMUL_W_CHAIN(SDNode *N);

SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,		SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
▲ Show 20 Lines • Show All 1,682 Lines • ▼ Show 20 Lines	bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,

return true;		return true;
}		}

static SDValue stripBitcast(SDValue Val) {		static SDValue stripBitcast(SDValue Val) {
return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;		return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}		}

		// Figure out if this is really an extract of the low 16-bits of a dword.
		static bool isExtractLoElt(SDValue In, SDValue &Out) {
		In = stripBitcast(In);
		if (In.getOpcode() != ISD::TRUNCATE)
		return false;

		Out = stripBitcast(In.getOperand(0));
		return true;
		}

// Figure out if this is really an extract of the high 16-bits of a dword.		// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {		static bool isExtractHiElt(SDValue In, SDValue &Out) {
In = stripBitcast(In);		In = stripBitcast(In);
if (In.getOpcode() != ISD::TRUNCATE)		if (In.getOpcode() != ISD::TRUNCATE)
return false;		return false;

SDValue Srl = In.getOperand(0);		SDValue Srl = In.getOperand(0);
if (Srl.getOpcode() == ISD::SRL) {		if (Srl.getOpcode() == ISD::SRL) {
▲ Show 20 Lines • Show All 167 Lines • ▼ Show 20 Lines
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,		bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {		SDValue &SrcMods) const {
unsigned Mods = 0;		unsigned Mods = 0;
SelectVOP3PMadMixModsImpl(In, Src, Mods);		SelectVOP3PMadMixModsImpl(In, Src, Mods);
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);		SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;		return true;
}		}

		bool AMDGPUDAGToDAGISel::SelectLo16Elt(SDValue In, SDValue &Src) const {
		if (In.isUndef()) {
		Src = In;
		return true;
		}

		// No constant handling unlike SelectHi16Elt() due to lack of need.
		arsenmUnsubmitted Not Done Reply Inline Actions Capitalize and punctuate arsenm: Capitalize and punctuate

		return isExtractLoElt(In, Src);
		}

// TODO: Can we identify things like v_mad_mixhi_f16?		// TODO: Can we identify things like v_mad_mixhi_f16?
bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {		bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
if (In.isUndef()) {		if (In.isUndef()) {
Src = In;		Src = In;
return true;		return true;
}		}

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {		if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
Show All 13 Lines	MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
SL, MVT::i32, K);		SL, MVT::i32, K);
Src = SDValue(MovK, 0);		Src = SDValue(MovK, 0);
return true;		return true;
}		}

return isExtractHiElt(In, Src);		return isExtractHiElt(In, Src);
}		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32(SDValue In, SDValue &Src, SDValue &SrcMods) const {
		In = stripBitcast(In);
		if (In.getOpcode() == ISD::TRUNCATE)
		arsenmUnsubmitted Not Done Reply Inline Actions I think you should need only one or the other here, not both. Either way, you could move this out of the function (and I think there's a wrapper for this somewhere already) arsenm: I think you should need only one or the other here, not both. Either way, you could move this…
		In = In.getOperand(0);
		return SelectCvtRtzF16F32Lo(In, Src, SrcMods);
		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32LoHiImpl(SDValue In, SDValue &Src, SDValue &SrcMods, bool Hi) const {
		In = stripBitcast(In);

		uint32_t Val;
		if (getConstantValue(In, Val)) {
		arsenmUnsubmitted Not Done Reply Inline Actions Constant folding during selection is pretty weird. Can we just do this in InstSimplify? arsenm: Constant folding during selection is pretty weird. Can we just do this in InstSimplify?
		if (Hi && Val & 0xffff)
		arsenmUnsubmitted Not Done Reply Inline Actions You can use the constructor directly without the extra = APFloat arsenm: You can use the constructor directly without the extra = APFloat
		return false;
		if (!Hi && Val >> 16)
		return false;

		Val = Hi ? Val >> 16 : Val;

		bool LostInfo;
		APFloat FVal(APFloatBase::IEEEhalf(), APInt(16, Val));
		FVal.convert(APFloatBase::IEEEsingle(), APFloatBase::rmNearestTiesToAway, &LostInfo);
		if (LostInfo)
		return false;

		SDLoc SL(In);
		SDValue K = CurDAG->getTargetConstant(FVal.bitcastToAPInt(), SL, MVT::i32);
		MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::f32, K);
		Src = SDValue(MovK, 0);
		SrcMods = CurDAG->getTargetConstant(0, SL, MVT::i32);
		return true;
		}

		unsigned shiftOpcode = Hi ? ISD::SHL : ISD::SRL;
		int shiftOperand = Hi ? 0 : 1;
		uint32_t andMask = Hi ? 0xffff0000u : 0xffffu;
		int andOperand = Hi ? 1 : 0;

		if (In.getOpcode() == ISD::AND) {
		// low: and(cvt_pkrtz(v, ), 0xffff)
		// high: and(cvt_pkrtz(, v), 0xffff0000)
		for (int i = 0; i < 2; i++) {
		if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In.getOperand(i))) {
		uint32_t v = C->getZExtValue();
		SDValue CvtPkRtz = stripBitcast(In.getOperand(!i));
		if (v == andMask && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32)
		return SelectVOP3Mods(CvtPkRtz.getOperand(andOperand), Src, SrcMods);
		arsenmUnsubmitted Not Done Reply Inline Actions Capitalize hi arsenm: Capitalize hi
		}
		}
		} else if (In.getOpcode() == shiftOpcode) {
		// low: srl(cvt_pkrtz(, v), 16)
		// high: shl(cvt_pkrtz(v, ), 16)
		if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
		uint32_t v = C->getZExtValue();
		SDValue CvtPkRtz = stripBitcast(In.getOperand(0));
		if (v == 16 && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32)
		return SelectVOP3Mods(CvtPkRtz.getOperand(shiftOperand), Src, SrcMods);
		}
		} else if (In.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32) {
		// low: cvt_pkrtz(v, 0)
		// high: cvt_pkrtz(0, v)
		uint32_t Val;
		if (getConstantValue(In.getOperand(!Hi), Val)) {
		APFloat FVal(APFloatBase::IEEEsingle(), APInt(32, Val));
		bool LostInfo;
		FVal.convert(APFloatBase::IEEEhalf(), APFloatBase::rmTowardZero, &LostInfo);
		if (FVal.bitcastToAPInt().getZExtValue() == 0)
		return SelectVOP3Mods(In.getOperand(Hi), Src, SrcMods);
		}
		}

		return false;
		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Lo(SDValue In, SDValue &Src, SDValue &SrcMods) const {
		return SelectCvtRtzF16F32LoHiImpl(In, Src, SrcMods, false);
		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Hi(SDValue In, SDValue &Src, SDValue &SrcMods) const {
		return SelectCvtRtzF16F32LoHiImpl(In, Src, SrcMods, true);
		}
		arsenmUnsubmitted Not Done Reply Inline Actions computeKnownBits? arsenm: computeKnownBits?
		pendingchaosAuthorUnsubmitted Not Done Reply Inline Actions I don't see how that would help with obtaining the source of the low/high 16 bits (the cvt_pkrtz node) and end up make the code more generic/smaller? Since it would still have to match for "and(cvt_pkrtz(v, ), 0xffff)" and such. I just realized that this function could handle cvt_pkrtz(v, 0) and cvt_pkrtz(0, v) (without any ands or shifts). Should I make it so (and do something similar for SelectCvtRtzF16F32)? pendingchaos: I don't see how that would help with obtaining the source of the low/high 16 bits (the…

bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {		bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {		if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
return false;		return false;
}		}
const SIRegisterInfo *SIRI =		const SIRegisterInfo *SIRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());		static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
const SIInstrInfo * SII =		const SIInstrInfo * SII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());		static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
▲ Show 20 Lines • Show All 216 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.td

	Show First 20 Lines • Show All 952 Lines • ▼ Show 20 Lines
	def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;			def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
	def VOP3OpSelMods0 : ComplexPattern<untyped, 3, "SelectVOP3OpSelMods0">;			def VOP3OpSelMods0 : ComplexPattern<untyped, 3, "SelectVOP3OpSelMods0">;

	def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;			def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;


	def Hi16Elt : ComplexPattern<untyped, 1, "SelectHi16Elt">;			def Hi16Elt : ComplexPattern<untyped, 1, "SelectHi16Elt">;

				def CvtRtzF16F32 : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32">;
				def CvtRtzF16F32Lo : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32Lo">;
				def CvtRtzF16F32Hi : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32Hi">;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// SI assembler operands			// SI assembler operands
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	def SIOperand {			def SIOperand {
	int ZERO = 0x80;			int ZERO = 0x80;
	int VCC = 0x6A;			int VCC = 0x6A;
	int FLAT_SCR = 0x68;			int FLAT_SCR = 0x68;
	▲ Show 20 Lines • Show All 1,082 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 1,574 Lines • ▼ Show 20 Lines
	>;			>;

	} // End SubtargetPredicates = isSI			} // End SubtargetPredicates = isSI

	//============================================================================//			//============================================================================//
	// Miscellaneous Optimization Patterns			// Miscellaneous Optimization Patterns
	//============================================================================//			//============================================================================//

				def : GCNPat <
				arsenmUnsubmitted Not Done Reply Inline Actions Should use isVI, or maybe these should be distinguished by GCN3Encoding? Needs a comment for why these are separated arsenm: Should use isVI, or maybe these should be distinguished by GCN3Encoding? Needs a comment for…
				pendingchaosAuthorUnsubmitted Not Done Reply Inline Actions Since all GCN versions support the VOP3a form, shouldn't it use isGCN (to combine the modifiers into the instruction on SI/CI)? The VOP2 form is only supported on SI/CI, so isSICI is used. IIRC VOP2 ended up being used when no modifiers could be folded. pendingchaos: Since all GCN versions support the VOP3a form, shouldn't it use isGCN (to combine the modifiers…
				arsenmUnsubmitted Not Done Reply Inline Actions isGCN is obsolete and should be removed anywhere it's used. Since we try to shrink instructions later, we only want to select the _e64 version when possible arsenm: isGCN is obsolete and should be removed anywhere it's used. Since we try to shrink…
				(v2f16 (build_vector (f16 (CvtRtzF16F32 f32:$src0, i32:$src0_mods)),
				(f16 (CvtRtzF16F32 f32:$src1, i32:$src1_mods)))),
				(V_CVT_PKRTZ_F16_F32_e64 $src0_mods, $src0,
				$src1_mods, $src1,
				DSTCLAMP.NONE, DSTOMOD.NONE)
				>;

				def : GCNPat <
				(i32 (or (CvtRtzF16F32Lo f32:$src0, i32:$src0_mods),
				(CvtRtzF16F32Hi f32:$src1, i32:$src1_mods))),
				(V_CVT_PKRTZ_F16_F32_e64 $src0_mods, $src0,
				$src1_mods, $src1,
				DSTCLAMP.NONE, DSTOMOD.NONE)
				>;

	// Undo sub x, c -> add x, -c canonicalization since c is more likely			// Undo sub x, c -> add x, -c canonicalization since c is more likely
	// an inline immediate than -c.			// an inline immediate than -c.
	// TODO: Also do for 64-bit.			// TODO: Also do for 64-bit.
	def : GCNPat<			def : GCNPat<
	(add i32:$src0, (i32 NegSubInlineConst32:$src1)),			(add i32:$src0, (i32 NegSubInlineConst32:$src1)),
	(S_SUB_I32 $src0, NegSubInlineConst32:$src1)			(S_SUB_I32 $src0, NegSubInlineConst32:$src1)
	>;			>;

	▲ Show 20 Lines • Show All 75 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/cvt_pkrtz_f16_f32_combine.ll

This file was added.

				; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes ALL,GFX67 %s
				; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes ALL,GFX67 %s
				; RUN: llc -march=amdgcn -mcpu=gfx802 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes ALL,GFX89 %s
				; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes ALL,GFX89 %s

				; ALL-LABEL: {{^}}packed_convert_low:
				; GFX67: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]*}}, v0, v1
				; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_low(float %a, float %b) #0 {
				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 0.0)
				%a_half = extractelement <2 x half> %a_half_vec, i32 0
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0)
				%b_half = extractelement <2 x half> %b_half_vec, i32 0

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_high:
				; GFX67: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]*}}, v0, v1
				arsenmUnsubmitted Not Done Reply Inline Actions Probably should explicitly test the different encodings for the different sub targets arsenm: Probably should explicitly test the different encodings for the different sub targets
				; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_high(float %a, float %b) #0 {
				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a)
				%a_half = extractelement <2 x half> %a_half_vec, i32 1
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b)
				%b_half = extractelement <2 x half> %b_half_vec, i32 1

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_low_high:
				; GFX67: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]*}}, v0, v1
				; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_low_high(float %a, float %b) #0 {
				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 0.0)
				%a_half = extractelement <2 x half> %a_half_vec, i32 0
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b)
				%b_half = extractelement <2 x half> %b_half_vec, i32 1

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_high_low:
				; GFX67: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]*}}, v0, v1
				; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_high_low(float %a, float %b) #0 {
				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a)
				%a_half = extractelement <2 x half> %a_half_vec, i32 1
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0)
				%b_half = extractelement <2 x half> %b_half_vec, i32 0

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_imm:
				; GFX67: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]*}}, 0x40a00000, v1
				; GFX89: s_mov_b32 s{{[0-9]*}}, 0x40a00000
				; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]}}, s{{[0-9]}}, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_imm(float %a, float %b) #0 {
				%a_half = fptrunc float 5.0 to half
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0)
				%b_half = extractelement <2 x half> %b_half_vec, i32 0

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_low_neg:
				; GFX67: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]*}}, -v0, v1
				; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, -v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_low_neg(float %a, float %b) #0 {
				%a_neg = fneg float %a

				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a_neg, float 0.0)
				%a_half = extractelement <2 x half> %a_half_vec, i32 0
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0)
				%b_half = extractelement <2 x half> %b_half_vec, i32 0

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_high_neg:
				; GFX67: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]*}}, -v0, v1
				; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]*}}, -v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_high_neg(float %a, float %b) #0 {
				%a_neg = fneg float %a

				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a_neg)
				%a_half = extractelement <2 x half> %a_half_vec, i32 1
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b)
				%b_half = extractelement <2 x half> %b_half_vec, i32 1

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1

				attributes #0 = { nounwind }
				attributes #1 = { nounwind readnone }