Diff 344799

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Show First 20 Lines • Show All 179 Lines • ▼ Show 20 Lines	class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
(ops node:$src0, node:$src1, node:$src2),		(ops node:$src0, node:$src1, node:$src2),
(op $src0, $src1, $src2),		(op $src0, $src1, $src2),
[{ return N->hasOneUse(); }]> {		[{ return N->hasOneUse(); }]> {
let GISelPredicateCode = [{		let GISelPredicateCode = [{
return MRI.hasOneNonDBGUse(MI.getOperand(0).getReg());		return MRI.hasOneNonDBGUse(MI.getOperand(0).getReg());
}];		}];
}		}

		class is_canonicalized<SDPatternOperator op> : PatFrag<
		foadUnsubmitted Not Done Reply Inline Actions This frag matches: a binary operator whose inputs are both canonicalized. I think it would be cleaner to have a frag (maybe a PatLeaf?) that matches just: a node that is canonicalized. Then, instead of a pattern like "is_canonicalized<build_vector> src0, src1", you would write "build_vector (is_canonicalized src0), (is_canonicalized src1)". Unfortunately my TableGen skills are not great, so I don't know exactly how to implement this. foad: This frag matches: a binary operator whose inputs are both canonicalized. I think it would be…
		jpagesAuthorUnsubmitted Not Done Reply Inline Actions I tried to do it as you suggested. My TableGen skills are not great, so maybe there is a way, but I couldn't find it. From my understanding: a PatLeaf cannot take an argument, so I can't write `is_canonicalized $src`. The only solution seems to be to use a PatFrag instead. I have to give a "type" to such a PatFrag, as well as specify the number of parameters. I tried to do something generic, but the unary pattern looked like this: `is_canonicalized_unary<BitConvert>`, or some other SDNode with only one operand. jpages: I tried to do it as you suggested. My TableGen skills are not great, so maybe there is a way…
		(ops node:$src0, node:$src1),
		(op $src0, $src1),
		[{
		const SITargetLowering &Lowering =
		*static_cast<const SITargetLowering *>(getTargetLowering());

		return Lowering.isCanonicalized(*CurDAG, N->getOperand(0)) &&
		Lowering.isCanonicalized(*CurDAG, N->getOperand(1));
		}]> {

		// TODO: Improve the Legalizer for g_build_vector in Global Isel to match this class
		let GISelPredicateCode = [{
		const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
		MF.getSubtarget().getTargetLowering());

		return TLI->isCanonicalized(MI.getOperand(1).getReg(), const_cast<MachineFunction&>(MF)) &&
		TLI->isCanonicalized(MI.getOperand(2).getReg(), const_cast<MachineFunction&>(MF));
		foadUnsubmitted Done Reply Inline Actions In MIR, instructions have dst operands followed by src operands. You probably need to check operand 1 and operand 2 here. foad: In MIR, instructions have dst operands followed by src operands. You probably need to check…
		}];
		}


let Properties = [SDNPCommutative, SDNPAssociative] in {		let Properties = [SDNPCommutative, SDNPAssociative] in {
def smax_oneuse : HasOneUseBinOp<smax>;		def smax_oneuse : HasOneUseBinOp<smax>;
def smin_oneuse : HasOneUseBinOp<smin>;		def smin_oneuse : HasOneUseBinOp<smin>;
def umax_oneuse : HasOneUseBinOp<umax>;		def umax_oneuse : HasOneUseBinOp<umax>;
def umin_oneuse : HasOneUseBinOp<umin>;		def umin_oneuse : HasOneUseBinOp<umin>;

def fminnum_oneuse : HasOneUseBinOp<fminnum>;		def fminnum_oneuse : HasOneUseBinOp<fminnum>;
def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;		def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
▲ Show 20 Lines • Show All 478 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 9,604 Lines • ▼ Show 20 Lines	bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::INSERT_VECTOR_ELT: {		case ISD::INSERT_VECTOR_ELT: {
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&		return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);		isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
}		}
case ISD::UNDEF:		case ISD::UNDEF:
// Could be anything.		// Could be anything.
return false;		return false;

case ISD::BITCAST: {		case ISD::BITCAST:
		return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
		case ISD::TRUNCATE: {
// Hack round the mess we make when legalizing extract_vector_elt		// Hack round the mess we make when legalizing extract_vector_elt
SDValue Src = Op.getOperand(0);		if (Op.getValueType() == MVT::i16) {
if (Src.getValueType() == MVT::i16 &&		SDValue TruncSrc = Op.getOperand(0);
Src.getOpcode() == ISD::TRUNCATE) {
SDValue TruncSrc = Src.getOperand(0);
if (TruncSrc.getValueType() == MVT::i32 &&		if (TruncSrc.getValueType() == MVT::i32 &&
TruncSrc.getOpcode() == ISD::BITCAST &&		TruncSrc.getOpcode() == ISD::BITCAST &&
TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {		TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);		return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
}		}
}		}
		foadUnsubmitted Not Done Reply Inline Actions If we're going to define isCanonicalized on integer types (which seems reasonable to me) then how about changing this to just: case ISD::BITCAST: return isCanonicalized(DAG, Src, MaxDepth - 1); ... and moving the "hack round the mess" code into `case ISD::TRUNCATE`? foad: If we're going to define isCanonicalized on integer types (which seems reasonable to me) then…

return false;		return false;
}		}
case ISD::INTRINSIC_WO_CHAIN: {		case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntrinsicID		unsigned IntrinsicID
= cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();		= cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
// TODO: Handle more intrinsics		// TODO: Handle more intrinsics
switch (IntrinsicID) {		switch (IntrinsicID) {
case Intrinsic::amdgcn_cvt_pkrtz:		case Intrinsic::amdgcn_cvt_pkrtz:
▲ Show 20 Lines • Show All 2,647 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 2,274 Lines • ▼ Show 20 Lines
	>;			>;

	// TODO: Should source modifiers be matched to v_pack_b32_f16?			// TODO: Should source modifiers be matched to v_pack_b32_f16?
	def : GCNPat <			def : GCNPat <
	(v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))),			(v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
	(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)			(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
	>;			>;

				def : GCNPat <
				(v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
				(f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
				(V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
				arsenmUnsubmitted Not Done Reply Inline Actions This isn't a simple bitpacking; it has FP output effects like flushing. arsenm: This isn't a simple bitpacking; it has FP output effects like flushing
				arsenmUnsubmitted Done Reply Inline Actions I believe source modifiers should work as normal, so you can use the VOP3Mods complex patterns for the sources arsenm: I believe source modifiers should work as normal, so you can use the VOP3Mods complex patterns…
				>;
	} // End SubtargetPredicate = HasVOP3PInsts			} // End SubtargetPredicate = HasVOP3PInsts


	def : GCNPat <			def : GCNPat <
	(v2f16 (scalar_to_vector f16:$src0)),			(v2f16 (scalar_to_vector f16:$src0)),
	(COPY $src0)			(COPY $src0)
	>;			>;

	def : GCNPat <			def : GCNPat <
	(v2i16 (scalar_to_vector i16:$src0)),			(v2i16 (scalar_to_vector i16:$src0)),
	(COPY $src0)			(COPY $src0)
	▲ Show 20 Lines • Show All 466 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll

Show First 20 Lines • Show All 282 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16(float addrspace(1)* %arg, half addrspace(1)* %out) #2 {
ret void		ret void
}		}

; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:		; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}		; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}		; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]		; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}		; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]		; GFX9: v_pack_b32_f16 [[V:v[0-9]+]], [[V1]], [[V0]]
; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
; GCN-NOT: v_mul		; GCN-NOT: v_mul
; GCN-NOT: v_max		; GCN-NOT: v_max
; GCN: {{flat\|global}}_store_dword v{{.+}}, [[V]]		; GCN: {{flat\|global}}_store_dword v{{.+}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {		define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()		%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id		%gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
%load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8		%load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
%v = fptrunc <2 x float> %load to <2 x half>		%v = fptrunc <2 x float> %load to <2 x half>
▲ Show 20 Lines • Show All 608 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

Show First 20 Lines • Show All 546 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)		%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out		store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
ret void		ret void
}		}

; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16:		; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16:
; GFX9: s_waitcnt		; GFX9: s_waitcnt
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0		; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0		; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX9-NEXT: s_setpc_b64		; GFX9-NEXT: s_setpc_b64

; High bits known zero		; High bits known zero
; FIXME: Should also be true on gfx9 by default?		; FIXME: Should also be true on gfx9 by default?
; VI: s_waitcnt		; VI: s_waitcnt
; VI-NEXT: v_max_f16_e32 v0, v0, v0		; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: s_setpc_b64		; VI-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {		define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines	define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
%vec = insertelement <2 x half> undef, half 16.0, i32 0		%vec = insertelement <2 x half> undef, half 16.0, i32 0
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)		%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized		ret <2 x half> %canonicalized
}		}

; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:		; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
; GFX9: s_waitcnt		; GFX9: s_waitcnt
; GFX9-DAG: v_max_f16_e32 v0, v0, v0		; GFX9-DAG: v_max_f16_e32 v0, v0, v0
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4000		; GFX9: v_pack_b32_f16 v0, v0, 2.0
; GFX9: v_and_b32_e32 v0, 0xffff, v0
; GFX9: v_lshl_or_b32 v0, [[K]], 16, v0
; GFX9: s_setpc_b64		; GFX9: s_setpc_b64

; VI: s_waitcnt		; VI: s_waitcnt
; VI-NEXT: v_max_f16_e32 v0, v0, v0		; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_or_b32_e32 v0, 2.0, v0		; VI-NEXT: v_or_b32_e32 v0, 2.0, v0
; VI-NEXT: s_setpc_b64		; VI-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {		define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
%vec0 = insertelement <2 x half> undef, half %val, i32 0		%vec0 = insertelement <2 x half> undef, half %val, i32 0
%vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1		%vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)		%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
ret <2 x half> %canonicalized		ret <2 x half> %canonicalized
}		}

; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16:		; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16:
; GFX9: v_max_f16_e32 v0, v0, v0		; GFX9: v_max_f16_e32 v0, v0, v0
; GFX9: v_mov_b32_e32 [[K:v[0-9]+]], 0x4000		; GFX9: v_pack_b32_f16 v0, 2.0, v0
; GFX9: v_lshl_or_b32 v0, v0, 16, [[K]]
; GFX9: s_setpc_b64		; GFX9: s_setpc_b64

; VI: s_waitcnt		; VI: s_waitcnt
; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD		; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0		; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0
; VI-NEXT: s_setpc_b64		; VI-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {		define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
%vec0 = insertelement <2 x half> undef, half 2.0, i32 0		%vec0 = insertelement <2 x half> undef, half 2.0, i32 0
Show All 9 Lines	define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(1)* %out) #1 {
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)		%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
store <4 x half> %canonicalized, <4 x half> addrspace(1)* %out		store <4 x half> %canonicalized, <4 x half> addrspace(1)* %out
ret void		ret void
}		}

; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16:		; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16:
; GFX9: s_waitcnt		; GFX9: s_waitcnt
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0		; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0		; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0		; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_setpc_b64		; GFX9-NEXT: s_setpc_b64

; VI: s_waitcnt		; VI: s_waitcnt
; VI-NEXT: v_max_f16_e32 v0, v0, v0		; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0		; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00		; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; VI-NEXT: s_setpc_b64		; VI-NEXT: s_setpc_b64
define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 {		define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 {
Show All 23 Lines	define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 {
ret <4 x half> %canonicalized		ret <4 x half> %canonicalized
}		}

; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_reg_reg_v4f16:		; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_reg_reg_v4f16:
; GFX9: s_waitcnt		; GFX9: s_waitcnt
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1		; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0		; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1		; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0		; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1		; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: s_setpc_b64		; GFX9-NEXT: s_setpc_b64

; VI: s_waitcnt		; VI: s_waitcnt
; VI-NEXT: v_max_f16_e32 v0, v0, v0		; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1		; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD		; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0		; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
Show All 14 Lines

llvm/test/CodeGen/AMDGPU/fexp.ll

	Show First 20 Lines • Show All 131 Lines • ▼ Show 20 Lines
	;			;
	; GFX9-LABEL: v_exp_v2f16:			; GFX9-LABEL: v_exp_v2f16:
	; GFX9: ; %bb.0:			; GFX9: ; %bb.0:
	; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX9-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5			; GFX9-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5
	; GFX9-NEXT: v_pk_mul_f16 v0, v0, [[SREG]] op_sel_hi:[1,0]			; GFX9-NEXT: v_pk_mul_f16 v0, v0, [[SREG]] op_sel_hi:[1,0]
	; GFX9-NEXT: v_exp_f16_e32 v1, v0			; GFX9-NEXT: v_exp_f16_e32 v1, v0
	; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
	; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
	; GFX9-NEXT: s_setpc_b64 s[30:31]			; GFX9-NEXT: s_setpc_b64 s[30:31]
	%result = call <2 x half> @llvm.exp.v2f16(<2 x half> %arg0)			%result = call <2 x half> @llvm.exp.v2f16(<2 x half> %arg0)
	ret <2 x half> %result			ret <2 x half> %result
	}			}

	; define <3 x half> @v_exp_v3f16(<3 x half> %arg0) {			; define <3 x half> @v_exp_v3f16(<3 x half> %arg0) {
	; %result = call <3 x half> @llvm.exp.v3f16(<3 x half> %arg0)			; %result = call <3 x half> @llvm.exp.v3f16(<3 x half> %arg0)
	; ret <3 x half> %result			; ret <3 x half> %result
	▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
	; GFX9: ; %bb.0:			; GFX9: ; %bb.0:
	; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX9-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5			; GFX9-NEXT: s_movk_i32 [[SREG:s[0-9]+]], 0x3dc5
	; GFX9-NEXT: v_mul_f16_e32 [[MUL1:v[0-9]+]], [[SREG]], v1			; GFX9-NEXT: v_mul_f16_e32 [[MUL1:v[0-9]+]], [[SREG]], v1
	; GFX9-NEXT: v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v0			; GFX9-NEXT: v_mul_f16_e32 [[MUL2:v[0-9]+]], [[SREG]], v0
	; GFX9-NEXT: v_mul_f16_sdwa [[MUL3:v[0-9]+]], v1, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD			; GFX9-NEXT: v_mul_f16_sdwa [[MUL3:v[0-9]+]], v1, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
	; GFX9-NEXT: v_mul_f16_sdwa [[MUL4:v[0-9]+]], v0, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD			; GFX9-NEXT: v_mul_f16_sdwa [[MUL4:v[0-9]+]], v0, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
	; GFX9-NEXT: v_exp_f16_e32 [[EXP1:v[0-9]+]], [[MUL1]]			; GFX9-NEXT: v_exp_f16_e32 [[EXP1:v[0-9]+]], [[MUL1]]
	; GFX9-NEXT: v_exp_f16_e32 [[EXP2:v[0-9]+]], [[MUL2]]			; GFX9-NEXT: v_exp_f16_e32 [[EXP2:v[0-9]+]], [[MUL3]]
	; GFX9-NEXT: v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL4]]			; GFX9-NEXT: v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL2]]
	; GFX9-NEXT: v_exp_f16_e32 [[EXP4:v[0-9]+]], [[MUL3]]			; GFX9-NEXT: v_exp_f16_e32 [[EXP4:v[0-9]+]], [[MUL4]]
	; GFX9-NEXT: v_mov_b32_e32 [[VCONST:v[0-9]+]], 0xffff			; GFX9-NEXT: v_pack_b32_f16 v1, [[EXP1]], [[EXP2]]
	; GFX9-NEXT: v_and_b32_e32 [[AND1:v[0-9]+]], [[VCONST]], [[EXP2]]			; GFX9-NEXT: v_pack_b32_f16 v0, [[EXP3]], [[EXP4]]
	; GFX9-NEXT: v_and_b32_e32 [[AND2:v[0-9]+]], [[VCONST]], [[EXP1]]
	; GFX9-NEXT: v_lshl_or_b32 v0, [[EXP3]], 16, [[AND1]]
	; GFX9-NEXT: v_lshl_or_b32 v1, [[EXP4]], 16, [[AND2]]
	; GFX9-NEXT: s_setpc_b64 s[30:31]			; GFX9-NEXT: s_setpc_b64 s[30:31]
	%result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0)			%result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0)
	ret <4 x half> %result			ret <4 x half> %result
	}			}

	declare float @llvm.exp.f32(float)			declare float @llvm.exp.f32(float)
	declare <2 x float> @llvm.exp.v2f32(<2 x float>)			declare <2 x float> @llvm.exp.v2f32(<2 x float>)
	declare <3 x float> @llvm.exp.v3f32(<3 x float>)			declare <3 x float> @llvm.exp.v3f32(<3 x float>)
	declare <4 x float> @llvm.exp.v4f32(<4 x float>)			declare <4 x float> @llvm.exp.v4f32(<4 x float>)

	declare half @llvm.exp.f16(half)			declare half @llvm.exp.f16(half)
	declare <2 x half> @llvm.exp.v2f16(<2 x half>)			declare <2 x half> @llvm.exp.v2f16(<2 x half>)
	declare <3 x half> @llvm.exp.v3f16(<3 x half>)			declare <3 x half> @llvm.exp.v3f16(<3 x half>)
	declare <4 x half> @llvm.exp.v4f16(<4 x half>)			declare <4 x half> @llvm.exp.v4f16(<4 x half>)

llvm/test/CodeGen/AMDGPU/fpow.ll

	Show First 20 Lines • Show All 184 Lines • ▼ Show 20 Lines
	; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0			; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
	; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1			; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
	; GFX9-NEXT: v_log_f32_e32 v2, v2			; GFX9-NEXT: v_log_f32_e32 v2, v2
	; GFX9-NEXT: v_log_f32_e32 v0, v0			; GFX9-NEXT: v_log_f32_e32 v0, v0
	; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2			; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
	; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0			; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
	; GFX9-NEXT: v_exp_f32_e32 v0, v0			; GFX9-NEXT: v_exp_f32_e32 v0, v0
	; GFX9-NEXT: v_exp_f32_e32 v1, v2			; GFX9-NEXT: v_exp_f32_e32 v2, v2
	; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0			; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
	; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1			; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
	; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0			; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
	; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
	; GFX9-NEXT: s_setpc_b64 s[30:31]			; GFX9-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX10-LABEL: v_pow_v2f16:			; GFX10-LABEL: v_pow_v2f16:
	; GFX10: ; %bb.0:			; GFX10: ; %bb.0:
	; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0			; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
	; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1			; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1
	; GFX10-NEXT: v_log_f32_e32 v2, v2			; GFX10-NEXT: v_log_f32_e32 v2, v2
	; GFX10-NEXT: v_log_f32_e32 v0, v0			; GFX10-NEXT: v_log_f32_e32 v0, v0
	; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2			; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
	; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0			; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
	; GFX10-NEXT: v_exp_f32_e32 v1, v2			; GFX10-NEXT: v_exp_f32_e32 v1, v2
	; GFX10-NEXT: v_exp_f32_e32 v0, v0			; GFX10-NEXT: v_exp_f32_e32 v0, v0
	; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1			; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
	; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0			; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
	; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
	; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
	; GFX10-NEXT: s_setpc_b64 s[30:31]			; GFX10-NEXT: s_setpc_b64 s[30:31]
	%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)			%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
	ret <2 x half> %pow			ret <2 x half> %pow
	}			}

	define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {			define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
	; GFX6-LABEL: v_pow_v2f16_fneg_lhs:			; GFX6-LABEL: v_pow_v2f16_fneg_lhs:
	; GFX6: ; %bb.0:			; GFX6: ; %bb.0:
	▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
	; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0			; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0
	; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1			; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
	; GFX9-NEXT: v_log_f32_e32 v2, v2			; GFX9-NEXT: v_log_f32_e32 v2, v2
	; GFX9-NEXT: v_log_f32_e32 v0, v0			; GFX9-NEXT: v_log_f32_e32 v0, v0
	; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2			; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
	; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0			; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
	; GFX9-NEXT: v_exp_f32_e32 v0, v0			; GFX9-NEXT: v_exp_f32_e32 v0, v0
	; GFX9-NEXT: v_exp_f32_e32 v1, v2			; GFX9-NEXT: v_exp_f32_e32 v2, v2
	; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0			; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
	; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1			; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
	; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0			; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
	; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
	; GFX9-NEXT: s_setpc_b64 s[30:31]			; GFX9-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX10-LABEL: v_pow_v2f16_fneg_lhs:			; GFX10-LABEL: v_pow_v2f16_fneg_lhs:
	; GFX10: ; %bb.0:			; GFX10: ; %bb.0:
	; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: v_cvt_f32_f16_e64 v2, -v0			;GFX10-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			;GFX10-NEXT: v_cvt_f32_f16_e64 v0, -v0
	; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1			;GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			;GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1
	; GFX10-NEXT: v_log_f32_e32 v2, v2			;GFX10-NEXT: v_log_f32_e32 v2, v2
	; GFX10-NEXT: v_log_f32_e32 v0, v0			;GFX10-NEXT: v_log_f32_e32 v0, v0
	; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2			;GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
	; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0			;GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
	; GFX10-NEXT: v_exp_f32_e32 v1, v2			;GFX10-NEXT: v_exp_f32_e32 v1, v2
	; GFX10-NEXT: v_exp_f32_e32 v0, v0			;GFX10-NEXT: v_exp_f32_e32 v0, v0
	; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1			;GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
	; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0			;GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
	; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1			;GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
	; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
	; GFX10-NEXT: s_setpc_b64 s[30:31]			; GFX10-NEXT: s_setpc_b64 s[30:31]
	%x.fneg = fneg <2 x half> %x			%x.fneg = fneg <2 x half> %x
	%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)			%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
	ret <2 x half> %pow			ret <2 x half> %pow
	}			}

	define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {			define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
	; GFX6-LABEL: v_pow_v2f16_fneg_rhs:			; GFX6-LABEL: v_pow_v2f16_fneg_rhs:
	▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines
	; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0			; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
	; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1			; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1
	; GFX9-NEXT: v_log_f32_e32 v2, v2			; GFX9-NEXT: v_log_f32_e32 v2, v2
	; GFX9-NEXT: v_log_f32_e32 v0, v0			; GFX9-NEXT: v_log_f32_e32 v0, v0
	; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2			; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
	; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0			; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
	; GFX9-NEXT: v_exp_f32_e32 v0, v0			; GFX9-NEXT: v_exp_f32_e32 v0, v0
	; GFX9-NEXT: v_exp_f32_e32 v1, v2			; GFX9-NEXT: v_exp_f32_e32 v2, v2
	; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0			; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
	; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1			; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
	; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0			; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
	; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
	; GFX9-NEXT: s_setpc_b64 s[30:31]			; GFX9-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX10-LABEL: v_pow_v2f16_fneg_rhs:			; GFX10-LABEL: v_pow_v2f16_fneg_rhs:
	; GFX10: ; %bb.0:			; GFX10: ; %bb.0:
	; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0			;GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			;GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
	; GFX10-NEXT: v_cvt_f32_f16_e64 v3, -v1			;GFX10-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			;GFX10-NEXT: v_cvt_f32_f16_e64 v1, -v1
	; GFX10-NEXT: v_log_f32_e32 v2, v2			;GFX10-NEXT: v_log_f32_e32 v2, v2
	; GFX10-NEXT: v_log_f32_e32 v0, v0			;GFX10-NEXT: v_log_f32_e32 v0, v0
	; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2			;GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
	; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0			;GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
	; GFX10-NEXT: v_exp_f32_e32 v1, v2			;GFX10-NEXT: v_exp_f32_e32 v1, v2
	; GFX10-NEXT: v_exp_f32_e32 v0, v0			;GFX10-NEXT: v_exp_f32_e32 v0, v0
	; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1			;GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
	; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0			;GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
	; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1			;GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
	; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
	; GFX10-NEXT: s_setpc_b64 s[30:31]			; GFX10-NEXT: s_setpc_b64 s[30:31]
	%y.fneg = fneg <2 x half> %y			%y.fneg = fneg <2 x half> %y
	%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)			%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
	ret <2 x half> %pow			ret <2 x half> %pow
	}			}

	define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {			define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
	; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs:			; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs:
	▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines
	; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0			; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0
	; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1			; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1
	; GFX9-NEXT: v_log_f32_e32 v2, v2			; GFX9-NEXT: v_log_f32_e32 v2, v2
	; GFX9-NEXT: v_log_f32_e32 v0, v0			; GFX9-NEXT: v_log_f32_e32 v0, v0
	; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2			; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
	; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0			; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
	; GFX9-NEXT: v_exp_f32_e32 v0, v0			; GFX9-NEXT: v_exp_f32_e32 v0, v0
	; GFX9-NEXT: v_exp_f32_e32 v1, v2			; GFX9-NEXT: v_exp_f32_e32 v2, v2
	; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0			; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
	; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1			; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
	; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0			; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
	; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
	; GFX9-NEXT: s_setpc_b64 s[30:31]			; GFX9-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs:			; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs:
	; GFX10: ; %bb.0:			; GFX10: ; %bb.0:
	; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: v_cvt_f32_f16_e64 v2, -v0			; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			; GFX10-NEXT: v_cvt_f32_f16_e64 v0, -v0
	; GFX10-NEXT: v_cvt_f32_f16_e64 v3, -v1			; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			; GFX10-NEXT: v_cvt_f32_f16_e64 v1, -v1
	; GFX10-NEXT: v_log_f32_e32 v2, v2			; GFX10-NEXT: v_log_f32_e32 v2, v2
	; GFX10-NEXT: v_log_f32_e32 v0, v0			; GFX10-NEXT: v_log_f32_e32 v0, v0
	; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2			; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
	; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0			; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
	; GFX10-NEXT: v_exp_f32_e32 v1, v2			; GFX10-NEXT: v_exp_f32_e32 v1, v2
	; GFX10-NEXT: v_exp_f32_e32 v0, v0			; GFX10-NEXT: v_exp_f32_e32 v0, v0
	; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1			; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
	; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0			; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
	; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
	; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
	; GFX10-NEXT: s_setpc_b64 s[30:31]			; GFX10-NEXT: s_setpc_b64 s[30:31]
	%x.fneg = fneg <2 x half> %x			%x.fneg = fneg <2 x half> %x
	%y.fneg = fneg <2 x half> %y			%y.fneg = fneg <2 x half> %y
	%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg)			%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg)
	ret <2 x half> %pow			ret <2 x half> %pow
	}			}

	; FIXME			; FIXME
	▲ Show 20 Lines • Show All 245 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll

	Show All 38 Lines
	; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]			; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
	; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]			; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
	; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]			; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

	; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD			; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
	; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]			; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

	; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]			; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
	; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]			; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
	; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]

	; GCN: buffer_store_dword v[[R_V2_F16]]			; GCN: buffer_store_dword v[[R_V2_F16]]
	; GCN: s_endpgm			; GCN: s_endpgm

	define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(			define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
	<2 x half> addrspace(1)* %r,			<2 x half> addrspace(1)* %r,
	<2 x float> addrspace(1)* %a) {			<2 x float> addrspace(1)* %a) {
	entry:			entry:
	▲ Show 20 Lines • Show All 140 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/frem.ll

	Show First 20 Lines • Show All 1,508 Lines • ▼ Show 20 Lines
	; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1			; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
	; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1			; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1
	; GFX9-NEXT: v_rcp_f32_e32 v5, v5			; GFX9-NEXT: v_rcp_f32_e32 v5, v5
	; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5			; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5
	; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4			; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
	; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1			; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1
	; GFX9-NEXT: v_trunc_f16_e32 v4, v4			; GFX9-NEXT: v_trunc_f16_e32 v4, v4
	; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1			; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1
	; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3			; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
	; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
	; GFX9-NEXT: global_store_dword v0, v1, s[4:5]			; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; GFX10-LABEL: frem_v2f16:			; GFX10-LABEL: frem_v2f16:
	; GFX10: ; %bb.0:			; GFX10: ; %bb.0:
	; GFX10-NEXT: s_clause 0x1			; GFX10-NEXT: s_clause 0x1
	; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	Show All 18 Lines
	; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1			; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
	; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2			; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
	; GFX10-NEXT: v_rcp_f32_e32 v5, v5			; GFX10-NEXT: v_rcp_f32_e32 v5, v5
	; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5			; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
	; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3			; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
	; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1			; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
	; GFX10-NEXT: v_trunc_f16_e32 v3, v3			; GFX10-NEXT: v_trunc_f16_e32 v3, v3
	; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2			; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
	; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4			; GFX10-NEXT: v_pack_b32_f16 v1, v4, v1
	; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
	; GFX10-NEXT: global_store_dword v0, v1, s[4:5]			; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
	; GFX10-NEXT: s_endpgm			; GFX10-NEXT: s_endpgm
	<2 x half> addrspace(1)* %in2) #0 {			<2 x half> addrspace(1)* %in2) #0 {
	%gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4			%gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4
	%r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8			%r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8
	%r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8			%r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8
	%r2 = frem <2 x half> %r0, %r1			%r2 = frem <2 x half> %r0, %r1
	store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8			store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8
	▲ Show 20 Lines • Show All 294 Lines • ▼ Show 20 Lines
	; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1			; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
	; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1			; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1
	; GFX9-NEXT: v_rcp_f32_e32 v7, v7			; GFX9-NEXT: v_rcp_f32_e32 v7, v7
	; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7			; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7
	; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6			; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6
	; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1			; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1
	; GFX9-NEXT: v_trunc_f16_e32 v6, v6			; GFX9-NEXT: v_trunc_f16_e32 v6, v6
	; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1			; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1
				; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1
				; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
				; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
				; GFX9-NEXT: v_rcp_f32_e32 v5, v5
				; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5
				; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
				; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0
				; GFX9-NEXT: v_trunc_f16_e32 v3, v3
				; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0
				; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
	; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2			; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2
	; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff			; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
	; GFX9-NEXT: v_and_b32_e32 v5, v3, v5
	; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5
	; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0			; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0
	; GFX9-NEXT: v_rcp_f32_e32 v6, v6			; GFX9-NEXT: v_rcp_f32_e32 v6, v6
	; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6			; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6
	; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5			; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5
	; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0			; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0
	; GFX9-NEXT: v_trunc_f16_e32 v5, v5			; GFX9-NEXT: v_trunc_f16_e32 v5, v5
	; GFX9-NEXT: v_fma_f16 v5, -v5, v2, v0			; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0
	; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2			; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
	; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v2
	; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
	; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v0
	; GFX9-NEXT: v_rcp_f32_e32 v7, v7
	; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7
	; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6
	; GFX9-NEXT: v_div_fixup_f16 v6, v6, v2, v0
	; GFX9-NEXT: v_trunc_f16_e32 v6, v6
	; GFX9-NEXT: v_fma_f16 v0, -v6, v2, v0
	; GFX9-NEXT: v_and_b32_e32 v2, v3, v5
	; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
	; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]			; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; GFX10-LABEL: frem_v4f16:			; GFX10-LABEL: frem_v4f16:
	; GFX10: ; %bb.0:			; GFX10: ; %bb.0:
	; GFX10-NEXT: s_clause 0x1			; GFX10-NEXT: s_clause 0x1
	; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	Show All 18 Lines
	; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1			; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
	; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3			; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3
	; GFX10-NEXT: v_rcp_f32_e32 v7, v7			; GFX10-NEXT: v_rcp_f32_e32 v7, v7
	; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7			; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
	; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5			; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
	; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1			; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
	; GFX10-NEXT: v_trunc_f16_e32 v5, v5			; GFX10-NEXT: v_trunc_f16_e32 v5, v5
	; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3			; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3
	; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff			; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
	; GFX10-NEXT: v_and_b32_e32 v5, v3, v6			; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
	; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2			; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
	; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v5			; GFX10-NEXT: v_rcp_f32_e32 v5, v5
	; GFX10-NEXT: v_rcp_f32_e32 v6, v6			; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
	; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0			; GFX10-NEXT: v_mov_b32_e32 v5, v0
	; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6			; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
	; GFX10-NEXT: v_mov_b32_e32 v6, v0			; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
	; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
	; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
	; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0			; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
	; GFX10-NEXT: v_trunc_f16_e32 v5, v5			; GFX10-NEXT: v_trunc_f16_e32 v3, v3
	; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v2			; GFX10-NEXT: v_fmac_f16_e64 v5, -v3, v2
	; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2			; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
	; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0			; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
	; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v2			; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
	; GFX10-NEXT: v_rcp_f32_e32 v7, v7			; GFX10-NEXT: v_rcp_f32_e32 v6, v6
	; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7			; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6
	; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5			; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
	; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0			; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
	; GFX10-NEXT: v_trunc_f16_e32 v5, v5			; GFX10-NEXT: v_trunc_f16_e32 v3, v3
	; GFX10-NEXT: v_fmac_f16_e64 v0, -v5, v2			; GFX10-NEXT: v_fmac_f16_e64 v0, -v3, v2
	; GFX10-NEXT: v_and_b32_e32 v2, v3, v6			; GFX10-NEXT: v_pack_b32_f16 v0, v5, v0
	; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2
	; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]			; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
	; GFX10-NEXT: s_endpgm			; GFX10-NEXT: s_endpgm
	<4 x half> addrspace(1)* %in2) #0 {			<4 x half> addrspace(1)* %in2) #0 {
	%gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4			%gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4
	%r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16			%r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16
	%r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16			%r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16
	%r2 = frem <4 x half> %r0, %r1			%r2 = frem <4 x half> %r0, %r1
	store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16			store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16
	▲ Show 20 Lines • Show All 932 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll

	Show First 20 Lines • Show All 129 Lines • ▼ Show 20 Lines
	; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
	; GFX9-NEXT: v_mov_b32_e32 v0, 0			; GFX9-NEXT: v_mov_b32_e32 v0, 0
	; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118			; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: global_load_dword v1, v0, s[2:3]			; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1			; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
	; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD			; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
	; GFX9-NEXT: v_cos_f16_e32 v3, v3			; GFX9-NEXT: v_cos_f16_e32 v2, v3
	; GFX9-NEXT: v_cos_f16_e32 v1, v1			; GFX9-NEXT: v_cos_f16_e32 v1, v1
	; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3			; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
	; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
	; GFX9-NEXT: global_store_dword v0, v1, s[0:1]			; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; GFX10-LABEL: cos_v2f16:			; GFX10-LABEL: cos_v2f16:
	; GFX10: ; %bb.0:			; GFX10: ; %bb.0:
	; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24			; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
	; GFX10-NEXT: v_mov_b32_e32 v0, 0			; GFX10-NEXT: v_mov_b32_e32 v0, 0
	; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118			; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
	; GFX10-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-NEXT: global_load_dword v1, v0, s[2:3]			; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
	; GFX10-NEXT: s_waitcnt vmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v1			; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
	; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD			; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
	; GFX10-NEXT: v_cos_f16_e32 v2, v2			; GFX10-NEXT: v_cos_f16_e32 v2, v3
	; GFX10-NEXT: v_cos_f16_e32 v1, v1			; GFX10-NEXT: v_cos_f16_e32 v1, v1
	; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2			; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
	; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
	; GFX10-NEXT: global_store_dword v0, v1, s[0:1]			; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
	; GFX10-NEXT: s_endpgm			; GFX10-NEXT: s_endpgm
	%a.val = load <2 x half>, <2 x half> addrspace(1)* %a			%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
	%r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)			%r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
	store <2 x half> %r.val, <2 x half> addrspace(1)* %r			store <2 x half> %r.val, <2 x half> addrspace(1)* %r
	ret void			ret void
	}			}

	declare half @llvm.cos.f16(half %a)			declare half @llvm.cos.f16(half %a)
	declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)			declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)

llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll

	Show First 20 Lines • Show All 49 Lines • ▼ Show 20 Lines
	; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]			; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
	; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]			; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
	; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]			; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
	; SI-NOT: v_and_b32_e32			; SI-NOT: v_and_b32_e32
	; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]			; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
	; VI-NOT: v_and_b32_e32			; VI-NOT: v_and_b32_e32
	; VI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_0]], v[[R_F16_2]]			; VI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_0]], v[[R_F16_2]]
	; GFX9: v_and_b32_e32 v[[R_F32_4:[0-9]+]], 0xffff, v[[R_F32_3]]			; GFX9: v_pack_b32_f16 v[[R_F32_5:[0-9]+]], v[[R_F32_3]], v[[R_F32_2]]
	; GFX9: v_lshl_or_b32 v[[R_F32_5:[0-9]+]], v[[R_F32_2]], 16, v[[R_F32_4]]
	; SI: buffer_store_dword v[[R_F32_5]]			; SI: buffer_store_dword v[[R_F32_5]]
	; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]			; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
	; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]			; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
	define void @log_v2f16(			define void @log_v2f16(
	<2 x half> addrspace(1)* %r,			<2 x half> addrspace(1)* %r,
	<2 x half> addrspace(1)* %a) {			<2 x half> addrspace(1)* %a) {
	entry:			entry:
	%a.val = load <2 x half>, <2 x half> addrspace(1)* %a			%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
	%r.val = call <2 x half> @llvm.log.v2f16(<2 x half> %a.val)			%r.val = call <2 x half> @llvm.log.v2f16(<2 x half> %a.val)
	store <2 x half> %r.val, <2 x half> addrspace(1)* %r			store <2 x half> %r.val, <2 x half> addrspace(1)* %r
	ret void			ret void
	}			}

llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll

	Show First 20 Lines • Show All 49 Lines • ▼ Show 20 Lines
	; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD			; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
	; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]			; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
	; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]			; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
	; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]			; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
	; SI-NOT: v_and_b32_e32			; SI-NOT: v_and_b32_e32
	; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]			; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
	; VI-NOT: v_and_b32_e32			; VI-NOT: v_and_b32_e32
	; VI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_0]], v[[R_F16_2]]			; VI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_0]], v[[R_F16_2]]
	; GFX9: v_and_b32_e32 v[[R_F32_4:[0-9]+]], 0xffff, v[[R_F32_3]]			; GFX9: v_pack_b32_f16 v[[R_F32_5:[0-9]+]], v[[R_F32_3]], v[[R_F32_2]]
	; GFX9: v_lshl_or_b32 v[[R_F32_5:[0-9]+]], v[[R_F32_2]], 16, v[[R_F32_4]]
	; SI: buffer_store_dword v[[R_F32_5]]			; SI: buffer_store_dword v[[R_F32_5]]
	; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]			; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
	; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]			; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
	define void @log10_v2f16(			define void @log10_v2f16(
	<2 x half> addrspace(1)* %r,			<2 x half> addrspace(1)* %r,
	<2 x half> addrspace(1)* %a) {			<2 x half> addrspace(1)* %a) {
	entry:			entry:
	%a.val = load <2 x half>, <2 x half> addrspace(1)* %a			%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
	%r.val = call <2 x half> @llvm.log10.v2f16(<2 x half> %a.val)			%r.val = call <2 x half> @llvm.log10.v2f16(<2 x half> %a.val)
	store <2 x half> %r.val, <2 x half> addrspace(1)* %r			store <2 x half> %r.val, <2 x half> addrspace(1)* %r
	ret void			ret void
	}			}

llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll

	Show All 37 Lines

	; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]			; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
	; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1			; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
	; VI-NOT: v_and_b32			; VI-NOT: v_and_b32
	; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]			; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]

	; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]			; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
	; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1			; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
	; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]			; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
	; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]

	; GCN: buffer_store_dword v[[R_V2_F16]]			; GCN: buffer_store_dword v[[R_V2_F16]]
	; GCN: s_endpgm			; GCN: s_endpgm

	define amdgpu_kernel void @rint_v2f16(			define amdgpu_kernel void @rint_v2f16(
	<2 x half> addrspace(1)* %r,			<2 x half> addrspace(1)* %r,
	<2 x half> addrspace(1)* %a) {			<2 x half> addrspace(1)* %a) {
	entry:			entry:
	%a.val = load <2 x half>, <2 x half> addrspace(1)* %a			%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
	%r.val = call <2 x half> @llvm.rint.v2f16(<2 x half> %a.val)			%r.val = call <2 x half> @llvm.rint.v2f16(<2 x half> %a.val)
	store <2 x half> %r.val, <2 x half> addrspace(1)* %r			store <2 x half> %r.val, <2 x half> addrspace(1)* %r
	ret void			ret void
	}			}

llvm/test/CodeGen/AMDGPU/llvm.round.ll

	Show First 20 Lines • Show All 81 Lines • ▼ Show 20 Lines

	; Should be scalarized			; Should be scalarized
	; FUNC-LABEL: {{^}}round_v2f16:			; FUNC-LABEL: {{^}}round_v2f16:
	; GFX89-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7fff{{$}}			; GFX89-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7fff{{$}}
	; GFX89-DAG: v_mov_b32_e32 [[BFI_K:v[0-9]+]], 0x3c00			; GFX89-DAG: v_mov_b32_e32 [[BFI_K:v[0-9]+]], 0x3c00
	; GFX89: v_bfi_b32 [[COPYSIGN0:v[0-9]+]], [[K]], [[BFI_K]],			; GFX89: v_bfi_b32 [[COPYSIGN0:v[0-9]+]], [[K]], [[BFI_K]],
	; GFX89: v_bfi_b32 [[COPYSIGN1:v[0-9]+]], [[K]], [[BFI_K]],			; GFX89: v_bfi_b32 [[COPYSIGN1:v[0-9]+]], [[K]], [[BFI_K]],

	; GFX9: v_and_b32_e32			; GFX9: v_pack_b32_f16
	; GFX9: v_lshl_or_b32
	define amdgpu_kernel void @round_v2f16(<2 x half> addrspace(1)* %out, i32 %in.arg) #0 {			define amdgpu_kernel void @round_v2f16(<2 x half> addrspace(1)* %out, i32 %in.arg) #0 {
	%in = bitcast i32 %in.arg to <2 x half>			%in = bitcast i32 %in.arg to <2 x half>
	%result = call <2 x half> @llvm.round.v2f16(<2 x half> %in)			%result = call <2 x half> @llvm.round.v2f16(<2 x half> %in)
	store <2 x half> %result, <2 x half> addrspace(1)* %out			store <2 x half> %result, <2 x half> addrspace(1)* %out
	ret void			ret void
	}			}

	declare float @llvm.round.f32(float) #1			declare float @llvm.round.f32(float) #1
	Show All 11 Lines

llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll

	Show First 20 Lines • Show All 129 Lines • ▼ Show 20 Lines
	; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
	; GFX9-NEXT: v_mov_b32_e32 v0, 0			; GFX9-NEXT: v_mov_b32_e32 v0, 0
	; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118			; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: global_load_dword v1, v0, s[2:3]			; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1			; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
	; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD			; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
	; GFX9-NEXT: v_sin_f16_e32 v3, v3			; GFX9-NEXT: v_sin_f16_e32 v2, v3
	; GFX9-NEXT: v_sin_f16_e32 v1, v1			; GFX9-NEXT: v_sin_f16_e32 v1, v1
	; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3			; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
	; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
	; GFX9-NEXT: global_store_dword v0, v1, s[0:1]			; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; GFX10-LABEL: sin_v2f16:			; GFX10-LABEL: sin_v2f16:
	; GFX10: ; %bb.0:			; GFX10: ; %bb.0:
	; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24			; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
	; GFX10-NEXT: v_mov_b32_e32 v0, 0			; GFX10-NEXT: v_mov_b32_e32 v0, 0
	; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118			; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
	; GFX10-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-NEXT: global_load_dword v1, v0, s[2:3]			; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
	; GFX10-NEXT: s_waitcnt vmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0)
	; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v1			; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
	; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD			; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
	; GFX10-NEXT: v_sin_f16_e32 v2, v2			; GFX10-NEXT: v_sin_f16_e32 v2, v3
	; GFX10-NEXT: v_sin_f16_e32 v1, v1			; GFX10-NEXT: v_sin_f16_e32 v1, v1
	; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2			; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
	; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
	; GFX10-NEXT: global_store_dword v0, v1, s[0:1]			; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
	; GFX10-NEXT: s_endpgm			; GFX10-NEXT: s_endpgm
	%a.val = load <2 x half>, <2 x half> addrspace(1)* %a			%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
	%r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)			%r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)
	store <2 x half> %r.val, <2 x half> addrspace(1)* %r			store <2 x half> %r.val, <2 x half> addrspace(1)* %r
	ret void			ret void
	}			}

	declare half @llvm.sin.f16(half %a)			declare half @llvm.sin.f16(half %a)
	declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)			declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)

llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll

Show First 20 Lines • Show All 219 Lines • ▼ Show 20 Lines	define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
%insert = insertelement <2 x half> %cvt.result, half %clamp.hi, i32 1		%insert = insertelement <2 x half> %cvt.result, half %clamp.hi, i32 1
ret <2 x half> %insert		ret <2 x half> %insert
}		}

; FIXME: Should be able to use mixlo/mixhi		; FIXME: Should be able to use mixlo/mixhi
; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt:		; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt:
; GFX9: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp		; GFX9: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp		; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
; GFX9: v_cvt_f16_f32_e32 v0, v0
; GFX9: v_cvt_f16_f32_e32 v1, v3		; GFX9: v_cvt_f16_f32_e32 v1, v3
; GFX9: v_and_b32_e32 v0, 0xffff, v0		; GFX9: v_cvt_f16_f32_e32 v0, v0
; GFX9: v_lshl_or_b32 v0, v1, 16, v0		; GFX9: v_pack_b32_f16 v0, v0, v1
; GFX9: s_setpc_b64		; GFX9: s_setpc_b64
define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {		define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
%src0.ext = fpext <2 x half> %src0 to <2 x float>		%src0.ext = fpext <2 x half> %src0 to <2 x float>
%src1.ext = fpext <2 x half> %src1 to <2 x float>		%src1.ext = fpext <2 x half> %src1 to <2 x float>
%src2.ext = fpext <2 x half> %src2 to <2 x float>		%src2.ext = fpext <2 x half> %src2 to <2 x float>
%result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)		%result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
%max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer)		%max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer)
%clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)		%clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
%cvt.result = fptrunc <2 x float> %clamp to <2 x half>		%cvt.result = fptrunc <2 x float> %clamp to <2 x half>
ret <2 x half> %cvt.result		ret <2 x half> %cvt.result
}		}

; FIXME: Handling undef 4th component		; FIXME: Handling undef 4th component
; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:		; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:
; GCN: s_waitcnt		; GCN: s_waitcnt
; GFX9-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp		; GFX9-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp		; GFX9-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp		; GFX9-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v3		; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v3
		; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1		; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0		; GFX9-NEXT: v_pack_b32_f16 v0, v0, v2
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT: s_setpc_b64		; GFX9-NEXT: s_setpc_b64
define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {		define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
%src0.ext = fpext <3 x half> %src0 to <3 x float>		%src0.ext = fpext <3 x half> %src0 to <3 x float>
%src1.ext = fpext <3 x half> %src1 to <3 x float>		%src1.ext = fpext <3 x half> %src1 to <3 x float>
%src2.ext = fpext <3 x half> %src2 to <3 x float>		%src2.ext = fpext <3 x half> %src2 to <3 x float>
%result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)		%result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
%max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)		%max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)
%clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)		%clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)
▲ Show 20 Lines • Show All 52 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/v_pack.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
				; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s
				foad [Unsubmitted · Not Done]: Maybe add a second RUN line with "llc -global-isel ..."?

				declare i32 @llvm.amdgcn.workitem.id.x() #1

				define amdgpu_kernel void @v_pack_b32_v2f16(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
				; GCN-LABEL: v_pack_b32_v2f16:
				arsenm [Unsubmitted · Done]: s/v2half/v2f16 for consistency
				; GCN: ; %bb.0:
				; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
				; GCN-NEXT: s_waitcnt lgkmcnt(0)
				; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
				; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
				; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
				; GCN-NEXT: ;;#ASMSTART
				; GCN-NEXT: ; use v0
				; GCN-NEXT: ;;#ASMEND
				; GCN-NEXT: s_endpgm
				;
				; GISEL-LABEL: v_pack_b32_v2f16:
				; GISEL: ; %bb.0:
				; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
				; GISEL-NEXT: s_waitcnt lgkmcnt(0)
				; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
				; GISEL-NEXT: s_waitcnt vmcnt(0)
				; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
				; GISEL-NEXT: s_waitcnt vmcnt(0)
				; GISEL-NEXT: s_waitcnt_depctr 0xffe3
				; GISEL-NEXT: s_movk_i32 s0, 0x4000
				; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
				foad [Unsubmitted · Not Done]: I think you are testing the wrong thing here. The result of "fadd float" is canonicalized, but if you extract the low 16 bits and bitcast it to half, then you get a meaningless value that is not canonicalized. Instead you should use two "fadd half" instructions and insert the results into a <2 x half>, with no bitcasting.
				; GISEL-NEXT: v_add_f16_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
				; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
				; GISEL-NEXT: ;;#ASMSTART
				; GISEL-NEXT: ; use v0
				; GISEL-NEXT: ;;#ASMEND
				; GISEL-NEXT: s_endpgm
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%tid.ext = sext i32 %tid to i64
				%in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
				%in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
				%v0 = load volatile half, half addrspace(1)* %in0.gep
				%v1 = load volatile half, half addrspace(1)* %in1.gep
				%v0.add = fadd half %v0, 2.0
				%v1.add = fadd half %v1, 2.0
				%vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
				%vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
				%vec.i32 = bitcast <2 x half> %vec.1 to i32
				call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
				ret void
				}

				define amdgpu_kernel void @v_pack_b32_v2f16_sub(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
				; GCN-LABEL: v_pack_b32_v2f16_sub:
				; GCN: ; %bb.0:
				; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
				; GCN-NEXT: s_waitcnt lgkmcnt(0)
				; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1
				; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
				; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
				; GCN-NEXT: ;;#ASMSTART
				; GCN-NEXT: ; use v0
				; GCN-NEXT: ;;#ASMEND
				; GCN-NEXT: s_endpgm
				;
				; GISEL-LABEL: v_pack_b32_v2f16_sub:
				; GISEL: ; %bb.0:
				; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
				; GISEL-NEXT: s_waitcnt lgkmcnt(0)
				; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
				; GISEL-NEXT: s_waitcnt vmcnt(0)
				; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
				; GISEL-NEXT: s_waitcnt vmcnt(0)
				; GISEL-NEXT: v_mov_b32_e32 v0, 0x4000
				; GISEL-NEXT: v_add_f16_e32 v1, -2.0, v1
				; GISEL-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
				; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
				; GISEL-NEXT: ;;#ASMSTART
				; GISEL-NEXT: ; use v0
				; GISEL-NEXT: ;;#ASMEND
				; GISEL-NEXT: s_endpgm
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%tid.ext = sext i32 %tid to i64
				%in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
				%in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
				%v0 = load volatile half, half addrspace(1)* %in0.gep
				%v1 = load volatile half, half addrspace(1)* %in1.gep
				%v0.add = fsub half %v0, 2.0
				%v1.add = fadd half %v1, 2.0
				%vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
				%vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
				%vec.i32 = bitcast <2 x half> %vec.1 to i32
				call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
				ret void
				}

				define amdgpu_kernel void @fptrunc(
				; GCN-LABEL: fptrunc:
				; GCN: ; %bb.0:
				; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GCN-NEXT: s_mov_b32 s6, -1
				; GCN-NEXT: s_mov_b32 s7, 0x31016000
				; GCN-NEXT: s_mov_b32 s10, s6
				; GCN-NEXT: s_mov_b32 s11, s7
				; GCN-NEXT: s_waitcnt lgkmcnt(0)
				; GCN-NEXT: s_mov_b32 s8, s2
				; GCN-NEXT: s_mov_b32 s9, s3
				; GCN-NEXT: s_mov_b32 s4, s0
				; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
				; GCN-NEXT: s_mov_b32 s5, s1
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
				; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
				; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
				; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
				; GCN-NEXT: s_endpgm
				;
				; GISEL-LABEL: fptrunc:
				; GISEL: ; %bb.0:
				; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GISEL-NEXT: s_waitcnt lgkmcnt(0)
				; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
				; GISEL-NEXT: s_waitcnt lgkmcnt(0)
				; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
				; GISEL-NEXT: v_cvt_f16_f32_sdwa v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
				; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
				; GISEL-NEXT: v_mov_b32_e32 v1, 0
				; GISEL-NEXT: global_store_dword v1, v0, s[0:1]
				; GISEL-NEXT: s_endpgm
				<2 x half> addrspace(1)* %r,
				<2 x float> addrspace(1)* %a) {
				%a.val = load <2 x float>, <2 x float> addrspace(1)* %a
				%r.val = fptrunc <2 x float> %a.val to <2 x half>
				store <2 x half> %r.val, <2 x half> addrspace(1)* %r
				ret void
				}

				define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
				arsenm [Unsubmitted · Done]: Can you add some cases with source modifier combinations?
				; GCN-LABEL: v_pack_b32.fabs:
				; GCN: ; %bb.0:
				; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
				; GCN-NEXT: s_waitcnt lgkmcnt(0)
				; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
				; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
				; GCN-NEXT: v_pack_b32_f16 v0, |v0|, |v1|
				; GCN-NEXT: ;;#ASMSTART
				; GCN-NEXT: ; use v0
				; GCN-NEXT: ;;#ASMEND
				; GCN-NEXT: s_endpgm
				;
				; GISEL-LABEL: v_pack_b32.fabs:
				; GISEL: ; %bb.0:
				; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
				; GISEL-NEXT: s_waitcnt lgkmcnt(0)
				; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
				; GISEL-NEXT: s_waitcnt vmcnt(0)
				; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
				; GISEL-NEXT: s_waitcnt vmcnt(0)
				; GISEL-NEXT: s_waitcnt_depctr 0xffe3
				; GISEL-NEXT: s_movk_i32 s0, 0x7fff
				; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
				; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
				; GISEL-NEXT: v_and_b32_e32 v0, s0, v0
				; GISEL-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
				; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
				; GISEL-NEXT: ;;#ASMSTART
				; GISEL-NEXT: ; use v0
				; GISEL-NEXT: ;;#ASMEND
				; GISEL-NEXT: s_endpgm
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%tid.ext = sext i32 %tid to i64
				%in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
				%in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
				%v0 = load volatile half, half addrspace(1)* %in0.gep
				%v1 = load volatile half, half addrspace(1)* %in1.gep
				%v0.add = fadd half %v0, 2.0
				%v1.add = fadd half %v1, 2.0
				%v0.fabs = call half @llvm.fabs.f16(half %v0.add)
				%v1.fabs = call half @llvm.fabs.f16(half %v1.add)
				%vec.0 = insertelement <2 x half> undef, half %v0.fabs, i32 0
				%vec.1 = insertelement <2 x half> %vec.0, half %v1.fabs, i32 1
				%vec.i32 = bitcast <2 x half> %vec.1 to i32
				call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
				ret void
				}

				define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
				; GCN-LABEL: v_pack_b32.fneg:
				; GCN: ; %bb.0:
				; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
				; GCN-NEXT: s_waitcnt lgkmcnt(0)
				; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
				; GCN-NEXT: s_waitcnt vmcnt(0)
				; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
				; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
				; GCN-NEXT: v_pack_b32_f16 v0, -v0, -v1
				; GCN-NEXT: ;;#ASMSTART
				; GCN-NEXT: ; use v0
				; GCN-NEXT: ;;#ASMEND
				; GCN-NEXT: s_endpgm
				;
				; GISEL-LABEL: v_pack_b32.fneg:
				; GISEL: ; %bb.0:
				; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
				; GISEL-NEXT: s_waitcnt lgkmcnt(0)
				; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
				; GISEL-NEXT: s_waitcnt vmcnt(0)
				; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
				; GISEL-NEXT: s_waitcnt vmcnt(0)
				; GISEL-NEXT: s_waitcnt_depctr 0xffe3
				; GISEL-NEXT: s_mov_b32 s0, 0x8000
				; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
				; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
				; GISEL-NEXT: v_add_f16_e64 v0, s0, -v0
				; GISEL-NEXT: v_add_f16_sdwa v1, s0, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
				; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
				; GISEL-NEXT: ;;#ASMSTART
				; GISEL-NEXT: ; use v0
				; GISEL-NEXT: ;;#ASMEND
				; GISEL-NEXT: s_endpgm
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%tid.ext = sext i32 %tid to i64
				%in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
				%in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
				%v0 = load volatile half, half addrspace(1)* %in0.gep
				%v1 = load volatile half, half addrspace(1)* %in1.gep
				%v0.add = fadd half %v0, 2.0
				%v1.add = fadd half %v1, 2.0
				%v0.fneg = fsub half -0.0, %v0.add
				%v1.fneg = fsub half -0.0, %v1.add
				%vec.0 = insertelement <2 x half> undef, half %v0.fneg, i32 0
				%vec.1 = insertelement <2 x half> %vec.0, half %v1.fneg, i32 1
				%vec.i32 = bitcast <2 x half> %vec.1 to i32
				call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
				ret void
				}

				declare half @llvm.fabs.f16(half) #1

				attributes #0 = { nounwind }
				attributes #1 = { nounwind readnone }

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Improve Codegen for build_vector
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 344799

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll

llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

llvm/test/CodeGen/AMDGPU/fexp.ll

llvm/test/CodeGen/AMDGPU/fpow.ll

llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll

llvm/test/CodeGen/AMDGPU/frem.ll

llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll

llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll

llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll

llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll

llvm/test/CodeGen/AMDGPU/llvm.round.ll

llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll

llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll

llvm/test/CodeGen/AMDGPU/v_pack.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Improve Codegen for build_vector
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 344799

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll

llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

llvm/test/CodeGen/AMDGPU/fexp.ll

llvm/test/CodeGen/AMDGPU/fpow.ll

llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll

llvm/test/CodeGen/AMDGPU/frem.ll

llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll

llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll

llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll

llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll

llvm/test/CodeGen/AMDGPU/llvm.round.ll

llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll

llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll

llvm/test/CodeGen/AMDGPU/v_pack.ll

[AMDGPU] Improve Codegen for build_vector
ClosedPublic