The range of values that f16 can represent fits into i32.
Lower f16->i64 as f16->i32->i64 instead of f16->f32->i64,
since the f32->i64 conversion has a long expansion.
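The range claim can be spot-checked in a few lines of Python (a sketch for illustration, not part of the patch; `struct`'s `'e'` format decodes IEEE 754 half precision):

```python
import struct

# Largest finite f16: exponent 0b11110, mantissa all ones -> bit pattern 0x7BFF.
(max_f16,) = struct.unpack('<e', struct.pack('<H', 0x7BFF))
print(max_f16)             # 65504.0
assert max_f16 < 2 ** 31   # every finite f16 magnitude fits in a signed i32
```

Since 65504 is far below 2^31 - 1, a single f16-to-i32 conversion never overflows for finite inputs, and the i64 result is just a sign extension of the i32 one.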
Details
- Reviewers: foad, arsenm
- Commits: rG44967fc60451: AMDGPU: Simplify f16 to i64 custom lowering

Diff Detail
- Repository: rG LLVM Github Monorepo

Event Timeline
llvm/test/CodeGen/AMDGPU/fptosi.f16.ll, line 108:
> Why are there test changes only for the vector cases?
Context was missing.
llvm/test/CodeGen/AMDGPU/fptosi.f16.ll, line 108:
> Tests only check for a few instructions, and here there was no v_cvt_f32_f16_e32.
This update also simplifies f16 -> i64 on subtargets without has16BitInsts().
Such subtargets immediately promote f16 to f32 whenever they encounter an f16 def, using an fp16_to_fp node. Recognize an fp16_to_fp input to an f32 -> i64 conversion and handle it the same way as a direct f16 -> i64 conversion. This gives similar results (with a small difference for f16 vectors) for subtargets with or without has16BitInsts(), since f16 -> i32 gets selected like f16 -> f32 -> i32.
Update tests with more detailed checks.
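The equivalence of the two lowering paths can be checked exhaustively over all 65536 f16 bit patterns. The sketch below is an illustration only: it models both paths with Python doubles (valid because f16 -> f32 and f16 -> f64 conversions are exact) and assumes round-toward-zero fp-to-int semantics, skipping NaN/inf inputs for which fp_to_sint is undefined anyway:

```python
import struct

for bits in range(1 << 16):
    # Decode this bit pattern as an IEEE 754 half-precision value.
    (h,) = struct.unpack('<e', struct.pack('<H', bits))
    if h != h or abs(h) == float('inf'):
        continue  # skip NaN and +/-inf: fp_to_sint is undefined for them
    via_i32 = int(h)          # f16 -> i32 (truncate), then sign-extend to i64
    via_f32 = int(float(h))   # f16 -> f32 (exact) -> i64 (truncate)
    assert via_i32 == via_f32
print("all finite f16 values agree")
```

Every finite f16 value truncates to the same integer on both paths, so replacing the f32 -> i64 expansion with f16 -> i32 plus a sign extension does not change results.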
I don't think this needs to be limited to Subtarget->has16BitInsts()