This is an archive of the discontinued LLVM Phabricator instance.

Differential D52018

[AMDGPU] Add instruction selection for i1 to f16 conversion
ClosedPublic

Authored by critson on Sep 13 2018, 1:02 AM.

Download Raw Diff

Details

Reviewers

arsenm
nhaehnle

Commits

rG6b8d75425eeb: [AMDGPU] Add instruction selection for i1 to f16 conversion
rL342558: [AMDGPU] Add instruction selection for i1 to f16 conversion

Summary

This is required for GPUs with 16 bit instructions where f16 is a
legal register type and hence int_to_fp i1 to f16 is not lowered
by legalizing.

Change-Id: Ie4c0fd6ced7cf10ad612023c6879724d9ded5851

Diff Detail

Repository

rL LLVM

Build Status

Buildable 22648
Build 22648: arc lint + arc unit

Event Timeline

critson created this revision. Sep 13 2018, 1:02 AM

Herald added subscribers: llvm-commits, t-tye, tpr and 6 others. · View Herald Transcript · Sep 13 2018, 1:02 AM

What about sitofp?

Why the detour via V_CVT_F16_F32 instead of selecting an fp16 1.0 constant directly?

Add support and test for sint_to_fp.

In D52018#1232860, @nhaehnle wrote:

Why the detour via V_CVT_F16_F32 instead of selecting an fp16 1.0 constant directly?

It is not possible to select an fp16 constant with V_CNDMASK_B32.
In principle the VOP2 version could be used to select an inline literal, but only when the i1 is vcc.

In D52018#1235002, @critson wrote:

In D52018#1232860, @nhaehnle wrote:

Why the detour via V_CVT_F16_F32 instead of selecting an fp16 1.0 constant directly?

It is not possible to select an fp16 constant with V_CNDMASK_B32.
In principle the VOP2 version could be used to select an inline literal, but only when the i1 is vcc.

Oh yeah, that makes sense.

Thanks for the sitofp change, LGTM.

This revision is now accepted and ready to land. Sep 17 2018, 3:08 AM

Closed by commit rL342558: [AMDGPU] Add instruction selection for i1 to f16 conversion (authored by critson). · Explain Why · Sep 19 2018, 9:33 AM

This revision was automatically updated to reflect the committed changes.

ronlieb added a subscriber: ronlieb. Sep 19 2018, 10:10 AM

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIInstructions.td

10 lines

test/

CodeGen/

AMDGPU/

sitofp.f16.ll

19 lines

uitofp.f16.ll

19 lines

Diff 165512

lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 1,315 Lines • ▼ Show 20 Lines
	>;			>;

	def : GCNPat <			def : GCNPat <
	(i1 (xor i1:$src0, i1:$src1)),			(i1 (xor i1:$src0, i1:$src1)),
	(S_XOR_B64 $src0, $src1)			(S_XOR_B64 $src0, $src1)
	>;			>;

	def : GCNPat <			def : GCNPat <
				(f16 (sint_to_fp i1:$src)),
				(V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src))
				>;

				def : GCNPat <
				(f16 (uint_to_fp i1:$src)),
				(V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src))
				>;

				def : GCNPat <
	(f32 (sint_to_fp i1:$src)),			(f32 (sint_to_fp i1:$src)),
	(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)			(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
	>;			>;

	def : GCNPat <			def : GCNPat <
	(f32 (uint_to_fp i1:$src)),			(f32 (uint_to_fp i1:$src)),
	(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)			(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
	>;			>;
	▲ Show 20 Lines • Show All 318 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/sitofp.f16.ll

Show First 20 Lines • Show All 86 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
<2 x i32> addrspace(1)* %a) {		<2 x i32> addrspace(1)* %a) {
entry:		entry:
%a.val = load <2 x i32>, <2 x i32> addrspace(1)* %a		%a.val = load <2 x i32>, <2 x i32> addrspace(1)* %a
%r.val = sitofp <2 x i32> %a.val to <2 x half>		%r.val = sitofp <2 x i32> %a.val to <2 x half>
store <2 x half> %r.val, <2 x half> addrspace(1)* %r		store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void		ret void
}		}

		; Signed i1 -> half conversion test. The i1 operand is the xor of two
		; float compares (a >= 0.0, b >= 1.0). The checks below expect the xor'd
		; mask to drive a v_cndmask_b32_e64 choosing between 0 and -1.0, with
		; v_cvt_f16_f32_e32 on that result as the very next instruction, and a
		; buffer_store_short before the end of the program.
		; FUNC-LABEL: {{^}}s_sint_to_fp_i1_to_f16:
		; GCN-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 1.0, {{v[0-9]+}}
		; GCN-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
		; GCN: s_xor_b64 [[R_CMP:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP0]]
		; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0, [[R_CMP]]
		; GCN-NEXT: v_cvt_f16_f32_e32 [[R_F16:v[0-9]+]], [[RESULT]]
		; GCN: buffer_store_short
		; GCN: s_endpgm
		define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(half addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
		%a = load float, float addrspace(1) * %in0
		%b = load float, float addrspace(1) * %in1
		%acmp = fcmp oge float %a, 0.000000e+00
		%bcmp = fcmp oge float %b, 1.000000e+00
		%result = xor i1 %acmp, %bcmp
		%fp = sitofp i1 %result to half
		store half %fp, half addrspace(1)* %out
		ret void
		}

; v2f16 = sitofp v2i64 is in sint_to_fp.i64.ll		; v2f16 = sitofp v2i64 is in sint_to_fp.i64.ll

test/CodeGen/AMDGPU/uitofp.f16.ll

Show First 20 Lines • Show All 86 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
<2 x i32> addrspace(1)* %a) {		<2 x i32> addrspace(1)* %a) {
entry:		entry:
%a.val = load <2 x i32>, <2 x i32> addrspace(1)* %a		%a.val = load <2 x i32>, <2 x i32> addrspace(1)* %a
%r.val = uitofp <2 x i32> %a.val to <2 x half>		%r.val = uitofp <2 x i32> %a.val to <2 x half>
store <2 x half> %r.val, <2 x half> addrspace(1)* %r		store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void		ret void
}		}

		; Unsigned i1 -> half conversion test. The i1 operand is the xor of two
		; float compares (a >= 0.0, b >= 1.0). The checks below expect the xor'd
		; mask to drive a v_cndmask_b32_e64 choosing between 0 and 1.0, with
		; v_cvt_f16_f32_e32 on that result as the very next instruction, and a
		; buffer_store_short before the end of the program.
		; FUNC-LABEL: {{^}}s_uint_to_fp_i1_to_f16:
		; GCN-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 1.0, {{v[0-9]+}}
		; GCN-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
		; GCN: s_xor_b64 [[R_CMP:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP0]]
		; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[R_CMP]]
		; GCN-NEXT: v_cvt_f16_f32_e32 [[R_F16:v[0-9]+]], [[RESULT]]
		; GCN: buffer_store_short
		; GCN: s_endpgm
		define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(half addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
		%a = load float, float addrspace(1) * %in0
		%b = load float, float addrspace(1) * %in1
		%acmp = fcmp oge float %a, 0.000000e+00
		%bcmp = fcmp oge float %b, 1.000000e+00
		%result = xor i1 %acmp, %bcmp
		%fp = uitofp i1 %result to half
		store half %fp, half addrspace(1)* %out
		ret void
		}

; f16 = uitofp i64 is in uint_to_fp.i64.ll		; f16 = uitofp i64 is in uint_to_fp.i64.ll