This is an archive of the discontinued LLVM Phabricator instance.

I wanted to implement this as a target independent expansion, however when
targets say they want to expand FP_TO_FP16 what they actually want is
the unsafe math expansion when possible and expansion to a libcall in all
other cases.

The only way to make this work as a target-independent expansion would be to add
logic to each target's TargetLowering construction to mark these nodes as Expand when
LegalizeDAG can use the unsafe expansion and mark them as LibCall when it
cannot. I think this would be possible, but I think it would be too fragile
and complex as it would require targets to keep their expansion logic up
to date with the code in LegalizeDAG.

LGTM with the test fixed

test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
1 ↗	(On Diff #76435)	Can you remove the -mcpu and add -verify-machineinstrs, also the FileCheck part is missing

Closed by commit rL285704: AMDGPU: Implement expansion of f16 = FP_TO_FP16 f64 (authored by tstellar). · Explain WhyNov 1 2016, 9:41 AM

This revision was automatically updated to reflect the committed changes.

jvesely added a subscriber: jvesely.Nov 1 2016, 11:52 AM

jvesely added inline comments.

llvm/trunk/test/CodeGen/AMDGPU/fptrunc.ll
4	GCN-UNSAFE is never checked. did you mean GCN-FAST (or use GCN-UNSAFE in the tests)?
17	Is there a reason to use i16 instead of half? can the half test be moved to this file?

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AMDGPU/

AMDGPUISelLowering.h

1 line

AMDGPUISelLowering.cpp

98 lines

test/

CodeGen/

AMDGPU/

fptrunc.ll

46 lines

trunc-store-f64-to-f16.ll

3 lines

Diff 76580

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h

Show First 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	protected:
SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;		SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;		SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;		SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
		SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;

protected:		protected:
bool shouldCombineMemoryType(EVT VT) const;		bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
▲ Show 20 Lines • Show All 259 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 273 Lines • ▼ Show 20 Lines	AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,

if (!Subtarget->hasBFI()) {		if (!Subtarget->hasBFI()) {
// fcopysign can be done in a single instruction with BFI.		// fcopysign can be done in a single instruction with BFI.
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);		setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);		setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
}		}

setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);		setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
		setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);

const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };		const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {		for (MVT VT : ScalarIntVTs) {
// These should use [SU]DIVREM, so set them to expand		// These should use [SU]DIVREM, so set them to expand
setOperationAction(ISD::SDIV, VT, Expand);		setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);		setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);		setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);		setOperationAction(ISD::UREM, VT, Expand);
▲ Show 20 Lines • Show All 511 Lines • ▼ Show 20 Lines	SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FCEIL: return LowerFCEIL(Op, DAG);		case ISD::FCEIL: return LowerFCEIL(Op, DAG);
case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);		case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
case ISD::FRINT: return LowerFRINT(Op, DAG);		case ISD::FRINT: return LowerFRINT(Op, DAG);
case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);		case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
case ISD::FROUND: return LowerFROUND(Op, DAG);		case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);		case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);		case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);		case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
		case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);		case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);		case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::CTLZ:		case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:		case ISD::CTLZ_ZERO_UNDEF:
return LowerCTLZ(Op, DAG);		return LowerCTLZ(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);		case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
}		}
return Op;		return Op;
▲ Show 20 Lines • Show All 1,137 Lines • ▼ Show 20 Lines	SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
MVT::i32, FloorMul);		MVT::i32, FloorMul);
SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);		SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);

SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});		SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});

return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);		return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
}		}

		/// Custom lowering for FP_TO_FP16 with an f64 source operand.
		///
		/// Builds the half-precision bit pattern with 32-bit integer DAG nodes
		/// operating on the two halves of the bitcast f64 value, then zero-extends
		/// or truncates the 32-bit result to the node's result type.
		///
		/// \param Op  The FP_TO_FP16 node; operand 0 is asserted to be f64.
		/// \param DAG The SelectionDAG used to create the replacement nodes.
		/// \returns An empty SDValue when UnsafeFPMath is enabled (deferring to the
		///          generic expansion), otherwise the integer value holding the
		///          f16 bits.
		SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {

		if (getTargetMachine().Options.UnsafeFPMath) {
		// There is a generic expand for FP_TO_FP16 with unsafe fast math.
		return SDValue();
		}

		SDLoc DL(Op);
		SDValue N0 = Op.getOperand(0);
		MVT SVT = N0.getSimpleValueType();
		assert(SVT == MVT::f64);

		// f64 -> f16 conversion using round-to-nearest-even rounding mode.
		const unsigned ExpMask = 0x7ff;
		const unsigned ExpBiasf64 = 1023;
		const unsigned ExpBiasf16 = 15;
		SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
		SDValue One = DAG.getConstant(1, DL, MVT::i32);
		// Split the 64-bit pattern: UH receives the upper 32 bits and U is
		// narrowed to the lower 32 bits.
		SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
		SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
		DAG.getConstant(32, DL, MVT::i64));
		UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
		U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
		// E = 11-bit exponent field, taken from bits 20..30 of the high word.
		// NOTE(review): this shift amount is typed i64 although the shifted value
		// is i32 and every other 32-bit shift below uses an i32 amount - confirm
		// this is intended.
		SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
		DAG.getConstant(20, DL, MVT::i64));
		E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
		DAG.getConstant(ExpMask, DL, MVT::i32));
		// Subtract the fp64 exponent bias (1023) to get the real exponent and
		// add the f16 bias (15) to get the biased exponent for the f16 format.
		// NOTE(review): both bias constants are unsigned, so the negation wraps
		// before being passed to getConstant - confirm the wrapped value is
		// accepted for an i32 constant.
		E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
		DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));

		// M = bits 9..19 of the high word moved to bits 1..11; bit 0 is left
		// clear here and filled in below.
		SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
		DAG.getConstant(8, DL, MVT::i32));
		M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
		DAG.getConstant(0xffe, DL, MVT::i32));

		// Collect every mantissa bit that was not kept in M: the low 9 bits of
		// the high word OR'ed with the whole low word.
		SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
		DAG.getConstant(0x1ff, DL, MVT::i32));
		MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);

		// Bit 0 of M becomes 1 when any of those dropped bits is set, so the
		// rounding step below can still see them.
		SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
		M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);

		// (M != 0 ? 0x0200 : 0) | 0x7c00;
		// Selected at the end when the source exponent field is all ones.
		SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
		DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
		Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));

		// N = M | (E << 12);
		// Candidate used when the rebiased exponent E is at least 1.
		SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
		DAG.getNode(ISD::SHL, DL, MVT::i32, E,
		DAG.getConstant(12, DL, MVT::i32)));

		// B = clamp(1-E, 0, 13);
		// Right-shift amount applied to the significand when E is below 1.
		SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
		One, E);
		SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
		B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
		DAG.getConstant(13, DL, MVT::i32));

		// Set bit 12, the position directly above the 12-bit M field, before
		// shifting.
		SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
		DAG.getConstant(0x1000, DL, MVT::i32));

		// D = SigSetHigh >> B, with bit 0 forced to 1 when shifting back left
		// does not reproduce SigSetHigh (i.e. set bits were shifted out).
		SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
		SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
		SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
		D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);

		// Choose D when E < 1, otherwise N. Then drop the two lowest bits and
		// add 1 when the low three bits were 3, 6 or 7: that is, the two dropped
		// bits are above one half, or exactly one half with the kept low bit odd.
		SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
		SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
		DAG.getConstant(0x7, DL, MVT::i32));
		V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
		DAG.getConstant(2, DL, MVT::i32));
		SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
		One, Zero, ISD::SETEQ);
		SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
		One, Zero, ISD::SETGT);
		V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
		V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);

		// E > 30 yields 0x7c00 (all exponent bits set, mantissa zero).
		// E == 1039 (0x7ff - 1023 + 15, an all-ones source exponent field)
		// yields I instead.
		V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
		DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
		V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
		I, V, ISD::SETEQ);

		// Extract the sign bit.
		// Bit 31 of the high word is moved down to bit 15.
		SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
		DAG.getConstant(16, DL, MVT::i32));
		Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
		DAG.getConstant(0x8000, DL, MVT::i32));

		// Merge the sign and convert the i32 result to the node's value type.
		V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
		return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
		}

SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,		SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);		SDValue Src = Op.getOperand(0);

if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)		if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
return LowerFP64_TO_INT(Op, DAG, true);		return LowerFP64_TO_INT(Op, DAG, true);

return SDValue();		return SDValue();
▲ Show 20 Lines • Show All 1,003 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/fptrunc.ll

	; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s
				; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GCN-UNSAFE %s

				jveselyUnsubmitted Not Done Reply Inline Actions GCN-UNSAFE is never checked. did you mean GCN-FAST (or use GCN-UNSAFE in the tests)? jvesely: GCN-UNSAFE is never checked. did you mean GCN-FAST (or use GCN-UNSAFE in the tests)?
	; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:			; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:
	; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}			; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
	define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {			define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
	%result = fptrunc double %in to float			%result = fptrunc double %in to float
	store float %result, float addrspace(1)* %out			store float %result, float addrspace(1)* %out
	ret void			ret void
	}			}

				; FUNC-LABEL: {{^}}fptrunc_f64_to_f16:
				; GCN-NOT: v_cvt
				; GCN-FAST: v_cvt_f32_f64_e32 [[F32:v[0-9]+]]
				; GCN-FAST: v_cvt_f16_f32_e32 v[0-9]+, [[F32]]
				define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
				jveselyUnsubmitted Not Done Reply Inline Actions Is there a reason to use i16 instead of half? can the half test be moved to this file? jvesely: Is there a reason to use i16 instead of half? can the half test be moved to this file?
				%result = fptrunc double %in to half
				%result_i16 = bitcast half %result to i16
				store i16 %result_i16, i16 addrspace(1)* %out
				ret void
				}

	; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32:			; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32:
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {			define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
	%result = fptrunc <2 x double> %in to <2 x float>			%result = fptrunc <2 x double> %in to <2 x float>
	store <2 x float> %result, <2 x float> addrspace(1)* %out			store <2 x float> %result, <2 x float> addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32:			; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32:
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {			define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
	%result = fptrunc <4 x double> %in to <4 x float>			%result = fptrunc <4 x double> %in to <4 x float>
	store <4 x float> %result, <4 x float> addrspace(1)* %out			store <4 x float> %result, <4 x float> addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32:			; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32:
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	; SI: v_cvt_f32_f64_e32			; GCN: v_cvt_f32_f64_e32
	define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {			define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
	%result = fptrunc <8 x double> %in to <8 x float>			%result = fptrunc <8 x double> %in to <8 x float>
	store <8 x float> %result, <8 x float> addrspace(1)* %out			store <8 x float> %result, <8 x float> addrspace(1)* %out
	ret void			ret void
	}			}

llvm/trunk/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll

	; XFAIL: *			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -mcpu=SI < %s

	; GCN-LABEL: {{^}}global_truncstore_f64_to_f16:			; GCN-LABEL: {{^}}global_truncstore_f64_to_f16:
	; GCN: s_endpgm			; GCN: s_endpgm
	define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {			define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
	%val = load double, double addrspace(1)* %in			%val = load double, double addrspace(1)* %in
	%cvt = fptrunc double %val to half			%cvt = fptrunc double %val to half
	store half %cvt, half addrspace(1)* %out			store half %cvt, half addrspace(1)* %out
	ret void			ret void
	▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

LegalizeDAG: Implement expansion of f16 = FP_TO_FP16 f64ClosedPublic

Details

Diff Detail