This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Add optimization patterns to combine fp32->fp16 conversions
Needs Review · Public

Authored by pendingchaos on Dec 1 2018, 10:57 AM.

Download Raw Diff

Details

Reviewers

arsenm
tstellar
alex-t

Summary

This makes a build_vector (or equivalent) built from the low or high words of
cvt_pkrtz_f16_f32 results get selected to a single v_cvt_pkrtz_f16_f32.

Diff Detail

Event Timeline

pendingchaos created this revision. Dec 1 2018, 10:57 AM

Herald added subscribers: t-tye, tpr, dstuttard and 5 others. · View Herald Transcript · Dec 1 2018, 10:57 AM

I don't have commit access.

ping?

arsenm added inline comments. Dec 16 2018, 10:32 PM

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
2045	Capitalize and punctuate
2080–2081	I think you should need only one or the other here, not both. Either way, you could move this out of the function (and I think there's a wrapper for this somewhere already)
2091	You can use the constructor directly without the extra = APFloat
2125	Capitalize hi
2154–2159	computeKnownBits?
lib/Target/AMDGPU/SIInstructions.td
1597	Should use isVI, or maybe these should be distinguished by GCN3Encoding? Needs a comment for why these are separated
test/CodeGen/AMDGPU/cvt_pkrtz_f16_f32_combine.ll
23	Probably should explicitly test the different encodings for the different sub targets

pendingchaos added inline comments. Dec 17 2018, 1:24 PM

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
2154–2159	I don't see how that would help with obtaining the source of the low/high 16 bits (the cvt_pkrtz node) and end up making the code more generic/smaller, since it would still have to match "and(cvt_pkrtz(v, 0), 0xffff)" and such. I just realized that this function could handle cvt_pkrtz(v, 0) and cvt_pkrtz(0, v) (without any ands or shifts). Should I make it so (and do something similar for SelectCvtRtzF16F32)?
lib/Target/AMDGPU/SIInstructions.td
1597	Since all GCN versions support the VOP3a form, shouldn't it use isGCN (to combine the modifiers into the instruction on SI/CI)? The VOP2 form is only supported on SI/CI, so isSICI is used. IIRC VOP2 ended up being used when no modifiers could be folded.

arsenm added inline comments. Jan 21 2019, 3:00 PM

lib/Target/AMDGPU/SIInstructions.td
1597	isGCN is obsolete and should be removed anywhere it's used. Since we try to shrink instructions later, we only want to select the _e64 version when possible

In this update:

The 64-bit encodings are always selected and SelectCvtRtzF16F32*Mods() have been removed.
SelectCvtRtzF16F32() is now implemented with SelectCvtRtzF16F32Lo().
cvt_pkrtz(v, 0) and cvt_pkrtz(0, v) are handled in SelectCvtRtzF16F32LoHiImpl().
The test explicitly tests the different encodings for the different sub targets.
The getConstantValue() helper is used.

Herald added a subscriber: jdoerfert. · View Herald Transcript · Feb 27 2019, 4:07 AM

arsenm added inline comments. Apr 7 2020, 2:11 PM

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
2090	Constant folding during selection is pretty weird. Can we just do this in InstSimplify?

Herald added a subscriber: kerbowa. · View Herald Transcript · Apr 7 2020, 2:11 PM

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

AMDGPUISelDAGToDAG.cpp

162 lines

SIInstrInfo.td

9 lines

SIInstructions.td

37 lines

test/

CodeGen/

AMDGPU/

cvt_pkrtz_f16_f32_combine.ll

126 lines

Diff 176261

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Show First 20 Lines • Show All 187 Lines • ▼ Show 20 Lines	bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp) const;		SDValue &Clamp) const;

bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;		bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,		bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp) const;		SDValue &Clamp) const;
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;		bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;		bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

		bool SelectLo16Elt(SDValue In, SDValue &Src) const;
bool SelectHi16Elt(SDValue In, SDValue &Src) const;		bool SelectHi16Elt(SDValue In, SDValue &Src) const;

		bool SelectCvtRtzF16F32(SDValue In, SDValue &Src) const;
		bool SelectCvtRtzF16F32Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

		bool SelectCvtRtzF16F32Impl(SDValue In, SDValue &Src, bool hi) const;
		bool SelectCvtRtzF16F32Lo(SDValue In, SDValue &Src) const;
		bool SelectCvtRtzF16F32Hi(SDValue In, SDValue &Src) const;
		bool SelectCvtRtzF16F32LoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
		bool SelectCvtRtzF16F32HiMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

void SelectADD_SUB_I64(SDNode *N);		void SelectADD_SUB_I64(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);		void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);		void SelectDIV_SCALE(SDNode *N);
void SelectMAD_64_32(SDNode *N);		void SelectMAD_64_32(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);		void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);		void SelectFMUL_W_CHAIN(SDNode *N);

SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,		SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
▲ Show 20 Lines • Show All 1,622 Lines • ▼ Show 20 Lines	bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,

return true;		return true;
}		}

static SDValue stripBitcast(SDValue Val) {		static SDValue stripBitcast(SDValue Val) {
return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;		return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}		}

		// Figure out if this is really an extract of the low 16-bits of a dword.
		static bool isExtractLoElt(SDValue In, SDValue &Out) {
		In = stripBitcast(In);
		if (In.getOpcode() != ISD::TRUNCATE)
		return false;

		Out = stripBitcast(In.getOperand(0));
		return true;
		}

// Figure out if this is really an extract of the high 16-bits of a dword.		// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {		static bool isExtractHiElt(SDValue In, SDValue &Out) {
In = stripBitcast(In);		In = stripBitcast(In);
if (In.getOpcode() != ISD::TRUNCATE)		if (In.getOpcode() != ISD::TRUNCATE)
return false;		return false;

SDValue Srl = In.getOperand(0);		SDValue Srl = In.getOperand(0);
if (Srl.getOpcode() == ISD::SRL) {		if (Srl.getOpcode() == ISD::SRL) {
▲ Show 20 Lines • Show All 167 Lines • ▼ Show 20 Lines
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,		bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {		SDValue &SrcMods) const {
unsigned Mods = 0;		unsigned Mods = 0;
SelectVOP3PMadMixModsImpl(In, Src, Mods);		SelectVOP3PMadMixModsImpl(In, Src, Mods);
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);		SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;		return true;
}		}

		bool AMDGPUDAGToDAGISel::SelectLo16Elt(SDValue In, SDValue &Src) const {
		if (In.isUndef()) {
		Src = In;
		return true;
		}

		// no constant handling unlike SelectHi16Elt() due to lack of need
		arsenmUnsubmitted Not Done Reply Inline Actions Capitalize and punctuate arsenm: Capitalize and punctuate

		return isExtractLoElt(In, Src);
		}

// TODO: Can we identify things like v_mad_mixhi_f16?		// TODO: Can we identify things like v_mad_mixhi_f16?
bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {		bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
if (In.isUndef()) {		if (In.isUndef()) {
Src = In;		Src = In;
return true;		return true;
}		}

if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {		if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
Show All 13 Lines	MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
SL, MVT::i32, K);		SL, MVT::i32, K);
Src = SDValue(MovK, 0);		Src = SDValue(MovK, 0);
return true;		return true;
}		}

return isExtractHiElt(In, Src);		return isExtractHiElt(In, Src);
}		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32(SDValue In, SDValue &Src) const {
		ConstantSDNode *CI = dyn_cast<ConstantSDNode>(In);
		ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(In);
		arsenmUnsubmitted Not Done Reply Inline Actions I think you should need only one or the other here, not both. Either way, you could move this out of the function (and I think there's a wrapper for this somewhere already) arsenm: I think you should need only one or the other here, not both. Either way, you could move this…
		if (CI || CF) {
		uint64_t Val = CI ? CI->getLimitedValue() : 0;
		if (CF)
		Val = CF->getValueAPF().bitcastToAPInt().getLimitedValue();

		if (Val >> 16)
		return false;

		bool LostInfo;
		arsenmUnsubmitted Not Done Reply Inline Actions Constant folding during selection is pretty weird. Can we just do this in InstSimplify? arsenm: Constant folding during selection is pretty weird. Can we just do this in InstSimplify?
		APFloat FVal = APFloat(APFloatBase::IEEEhalf(), APInt(16, Val));
		arsenmUnsubmitted Not Done Reply Inline Actions You can use the constructor directly without the extra = APFloat arsenm: You can use the constructor directly without the extra = APFloat
		FVal.convert(APFloatBase::IEEEsingle(), APFloatBase::rmNearestTiesToAway, &LostInfo);
		if (LostInfo)
		return false;

		SDLoc SL(In);
		SDValue K = CurDAG->getTargetConstant(FVal.bitcastToAPInt(), SL, MVT::i32);
		MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::f32, K);
		Src = SDValue(MovK, 0);
		return true;
		}

		SDValue CvtPkRtz;

		if (SelectHi16Elt(In, CvtPkRtz) && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32) {
		Src = CvtPkRtz.getOperand(1);
		return true;
		}

		if (SelectLo16Elt(In, CvtPkRtz) && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32) {
		Src = CvtPkRtz.getOperand(0);
		return true;
		}

		return false;
		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Mods(SDValue In, SDValue &Src, SDValue &Mods) const {
		SDValue Tmp;
		if (SelectCvtRtzF16F32(In, Tmp))
		return SelectVOP3Mods(Tmp, Src, Mods);
		return false;
		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Impl(SDValue In, SDValue &Src, bool hi) const {
		arsenmUnsubmitted Not Done Reply Inline Actions Capitalize hi arsenm: Capitalize hi
		In = stripBitcast(In);

		ConstantSDNode *CI = dyn_cast<ConstantSDNode>(In);
		ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(In);
		if (CI || CF) {
		uint32_t Val = CI ? CI->getLimitedValue() : 0;
		if (CF)
		Val = CF->getValueAPF().bitcastToAPInt().getLimitedValue();
		if (hi && Val & 0xffff)
		return false;
		if (!hi && Val >> 16)
		return false;

		Val = hi ? Val >> 16 : Val;

		bool LostInfo;
		APFloat FVal = APFloat(APFloatBase::IEEEhalf(), APInt(16, Val));
		FVal.convert(APFloatBase::IEEEsingle(), APFloatBase::rmNearestTiesToAway, &LostInfo);
		if (LostInfo)
		return false;

		SDLoc SL(In);
		SDValue K = CurDAG->getTargetConstant(FVal.bitcastToAPInt(), SL, MVT::i32);
		MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::f32, K);
		Src = SDValue(MovK, 0);
		return true;
		}

		unsigned shiftOpcode = hi ? ISD::SHL : ISD::SRL;
		int shiftOperand = hi ? 0 : 1;
		uint32_t andMask = hi ? 0xffff0000u : 0xffffu;
		int andOperand = hi ? 1 : 0;

		if (In.getOpcode() == ISD::AND) {
		arsenmUnsubmitted Not Done Reply Inline Actions computeKnownBits? arsenm: computeKnownBits?
		pendingchaosAuthorUnsubmitted Not Done Reply Inline Actions I don't see how that would help with obtaining the source of the low/high 16 bits (the cvt_pkrtz node) and end up make the code more generic/smaller? Since it would still have to match for "and(cvt_pkrtz(v, ), 0xffff)" and such. I just realized that this function could handle cvt_pkrtz(v, 0) and cvt_pkrtz(0, v) (without any ands or shifts). Should I make it so (and do something similar for SelectCvtRtzF16F32)? pendingchaos: I don't see how that would help with obtaining the source of the low/high 16 bits (the…
		// low: and(cvt_pkrtz(v, 0), 0xffff)
		// high: and(cvt_pkrtz(0, v), 0xffff0000)
		for (int i = 0; i < 2; i++) {
		if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In.getOperand(i))) {
		uint32_t v = C->getZExtValue();
		SDValue CvtPkRtz = stripBitcast(In.getOperand(!i));
		if (v == andMask && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32) {
		Src = CvtPkRtz.getOperand(andOperand);
		return true;
		}
		}
		}
		} else if (In.getOpcode() == shiftOpcode) {
		// low: srl(cvt_pkrtz(0, v), 16)
		// high: shl(cvt_pkrtz(v, 0), 16)
		if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
		uint32_t v = C->getZExtValue();
		SDValue CvtPkRtz = stripBitcast(In.getOperand(0));
		if (v == 16 && CvtPkRtz.getOpcode() == AMDGPUISD::CVT_PKRTZ_F16_F32) {
		Src = CvtPkRtz.getOperand(shiftOperand);
		return true;
		}
		}
		}

		return false;
		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Lo(SDValue In, SDValue &Src) const {
		return SelectCvtRtzF16F32Impl(In, Src, false);
		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32Hi(SDValue In, SDValue &Src) const {
		return SelectCvtRtzF16F32Impl(In, Src, true);
		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32LoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const {
		SDValue Tmp;
		if (SelectCvtRtzF16F32Lo(In, Tmp))
		return SelectVOP3Mods(Tmp, Src, SrcMods);
		return false;
		}

		bool AMDGPUDAGToDAGISel::SelectCvtRtzF16F32HiMods(SDValue In, SDValue &Src, SDValue &SrcMods) const {
		SDValue Tmp;
		if (SelectCvtRtzF16F32Hi(In, Tmp))
		return SelectVOP3Mods(Tmp, Src, SrcMods);
		return false;
		}

bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {		bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {		if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
return false;		return false;
}		}
const SIRegisterInfo *SIRI =		const SIRegisterInfo *SIRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());		static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
const SIInstrInfo * SII =		const SIInstrInfo * SII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());		static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
▲ Show 20 Lines • Show All 216 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.td

	Show First 20 Lines • Show All 952 Lines • ▼ Show 20 Lines
	def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;			def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
	def VOP3OpSelMods0 : ComplexPattern<untyped, 3, "SelectVOP3OpSelMods0">;			def VOP3OpSelMods0 : ComplexPattern<untyped, 3, "SelectVOP3OpSelMods0">;

	def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;			def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;


	def Hi16Elt : ComplexPattern<untyped, 1, "SelectHi16Elt">;			def Hi16Elt : ComplexPattern<untyped, 1, "SelectHi16Elt">;

				def CvtRtzF16F32 : ComplexPattern<untyped, 1, "SelectCvtRtzF16F32">;
				def CvtRtzF16F32Mods : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32Mods">;

				def CvtRtzF16F32Lo : ComplexPattern<untyped, 1, "SelectCvtRtzF16F32Lo">;
				def CvtRtzF16F32LoMods : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32LoMods">;

				def CvtRtzF16F32Hi : ComplexPattern<untyped, 1, "SelectCvtRtzF16F32Hi">;
				def CvtRtzF16F32HiMods : ComplexPattern<untyped, 2, "SelectCvtRtzF16F32HiMods">;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// SI assembler operands			// SI assembler operands
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	def SIOperand {			def SIOperand {
	int ZERO = 0x80;			int ZERO = 0x80;
	int VCC = 0x6A;			int VCC = 0x6A;
	int FLAT_SCR = 0x68;			int FLAT_SCR = 0x68;
	▲ Show 20 Lines • Show All 1,064 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 1,588 Lines • ▼ Show 20 Lines
	>;			>;

	} // End SubtargetPredicates = isSI			} // End SubtargetPredicates = isSI

	//============================================================================//			//============================================================================//
	// Miscellaneous Optimization Patterns			// Miscellaneous Optimization Patterns
	//============================================================================//			//============================================================================//

				let SubtargetPredicate = isGCN in {
				arsenmUnsubmitted Not Done Reply Inline Actions Should use isVI, or maybe these should be distinguished by GCN3Encoding? Needs a comment for why these are separated arsenm: Should use isVI, or maybe these should be distinguished by GCN3Encoding? Needs a comment for…
				pendingchaosAuthorUnsubmitted Not Done Reply Inline Actions Since all GCN versions support the VOP3a form, shouldn't it use isGCN (to combine the modifiers into the instruction on SI/CI)? The VOP2 form is only supported on SI/CI, so isSICI is used. IIRC VOP2 ended up being used when no modifiers could be folded. pendingchaos: Since all GCN versions support the VOP3a form, shouldn't it use isGCN (to combine the modifiers…
				arsenmUnsubmitted Not Done Reply Inline Actions isGCN is obsolete and should be removed anywhere it's used. Since we try to shrink instructions later, we only want to select the _e64 version when possible arsenm: isGCN is obsolete and should be removed anywhere it's used. Since we try to shrink…

				def : GCNPat <
				(v2f16 (build_vector (f16 (CvtRtzF16F32Mods f32:$src0, i32:$src0_mods)),
				(f16 (CvtRtzF16F32Mods f32:$src1, i32:$src1_mods)))),
				(V_CVT_PKRTZ_F16_F32_e64 $src0_mods, $src0,
				$src1_mods, $src1,
				DSTCLAMP.NONE, DSTOMOD.NONE)
				>;

				def : GCNPat <
				(i32 (or (CvtRtzF16F32LoMods f32:$src0, i32:$src0_mods),
				(CvtRtzF16F32HiMods f32:$src1, i32:$src1_mods))),
				(V_CVT_PKRTZ_F16_F32_e64 $src0_mods, $src0,
				$src1_mods, $src1,
				DSTCLAMP.NONE, DSTOMOD.NONE)
				>;

				} // End SubtargetPredicates = isGCN

				let SubtargetPredicate = isSICI in {

				def : GCNPat <
				(v2f16 (build_vector (f16 (CvtRtzF16F32 f32:$src0)),
				(f16 (CvtRtzF16F32 f32:$src1)))),
				(V_CVT_PKRTZ_F16_F32_e32 $src0, $src1)
				>;

				def : GCNPat <
				(i32 (or (i32 (CvtRtzF16F32Lo f32:$src0)),
				(i32 (CvtRtzF16F32Hi f32:$src1)))),
				(V_CVT_PKRTZ_F16_F32_e32 $src0, $src1)
				>;

				} // End SubtargetPredicates = isSICI


	// Undo sub x, c -> add x, -c canonicalization since c is more likely			// Undo sub x, c -> add x, -c canonicalization since c is more likely
	// an inline immediate than -c.			// an inline immediate than -c.
	// TODO: Also do for 64-bit.			// TODO: Also do for 64-bit.
	def : GCNPat<			def : GCNPat<
	(add i32:$src0, (i32 NegSubInlineConst32:$src1)),			(add i32:$src0, (i32 NegSubInlineConst32:$src1)),
	(S_SUB_I32 $src0, NegSubInlineConst32:$src1)			(S_SUB_I32 $src0, NegSubInlineConst32:$src1)
	>;			>;

	▲ Show 20 Lines • Show All 79 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/cvt_pkrtz_f16_f32_combine.ll

				; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX67 %s
				; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX67 %s
				; RUN: llc -march=amdgcn -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX89 %s
				; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes ALL,GFX89 %s

				; ALL-LABEL: {{^}}packed_convert_low:
				; ALL: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_low(float %a, float %b) #0 {
				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 0.0)
				%a_half = extractelement <2 x half> %a_half_vec, i32 0
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0)
				%b_half = extractelement <2 x half> %b_half_vec, i32 0

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_high:
				; ALL: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, v0, v1
				arsenmUnsubmitted Not Done Reply Inline Actions Probably should explicitly test the different encodings for the different sub targets arsenm: Probably should explicitly test the different encodings for the different sub targets
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_high(float %a, float %b) #0 {
				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a)
				%a_half = extractelement <2 x half> %a_half_vec, i32 1
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b)
				%b_half = extractelement <2 x half> %b_half_vec, i32 1

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_low_high:
				; ALL: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_low_high(float %a, float %b) #0 {
				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 0.0)
				%a_half = extractelement <2 x half> %a_half_vec, i32 0
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b)
				%b_half = extractelement <2 x half> %b_half_vec, i32 1

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_high_low:
				; ALL: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_high_low(float %a, float %b) #0 {
				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a)
				%a_half = extractelement <2 x half> %a_half_vec, i32 1
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0)
				%b_half = extractelement <2 x half> %b_half_vec, i32 0

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_imm:
				; GFX89: s_mov_b32 s{{[0-9]*}}, 0x40a00000
				; GFX89: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]}}, s{{[0-9]}}, v1
				; GFX67: v_cvt_pkrtz_f16_f32{{(_e32|_e64)?}} v{{[0-9]*}}, 0x40a00000, v1
				; GFX67-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_imm(float %a, float %b) #0 {
				%a_half = fptrunc float 5.0 to half
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0)
				%b_half = extractelement <2 x half> %b_half_vec, i32 0

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_low_neg:
				; ALL: v_cvt_pkrtz_f16_f32{{(_e64)?}} v{{[0-9]*}}, -v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_low_neg(float %a, float %b) #0 {
				%a_neg = fneg float %a

				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a_neg, float 0.0)
				%a_half = extractelement <2 x half> %a_half_vec, i32 0
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %b, float 0.0)
				%b_half = extractelement <2 x half> %b_half_vec, i32 0

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				; ALL-LABEL: {{^}}packed_convert_high_neg:
				; ALL: v_cvt_pkrtz_f16_f32{{(_e64)?}} v{{[0-9]*}}, -v0, v1
				; ALL-NOT: v_cvt_pkrtz_f16_f32
				define i32 @packed_convert_high_neg(float %a, float %b) #0 {
				%a_neg = fneg float %a

				%a_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %a_neg)
				%a_half = extractelement <2 x half> %a_half_vec, i32 1
				%b_half_vec = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %b)
				%b_half = extractelement <2 x half> %b_half_vec, i32 1

				%vec_tmp = insertelement <2 x half> undef, half %a_half, i32 0
				%vec = insertelement <2 x half> %vec_tmp, half %b_half, i32 1

				%vec_i32 = bitcast <2 x half> %vec to i32
				ret i32 %vec_i32
				}

				declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1

				attributes #0 = { nounwind }
				attributes #1 = { nounwind readnone }