This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Implement {{s|u}}int_to_fp i64 -> f32
ClosedPublic

Authored by arsenm on Jan 11 2016, 8:13 AM.

Download Raw Diff

Details

Reviewers

• tstellarAMD
arsenm

Summary

The old lowering for uint_to_fp failed OpenCL conformance.
It might be OK for fast math mode, but I'm not sure.

Diff Detail

Event Timeline

arsenm updated this revision to Diff 44512.Jan 11 2016, 8:13 AM

arsenm retitled this revision to AMDGPU: Implement {{s|u}}int_to_fp i64 -> f32.

arsenm updated this object.

arsenm added a reviewer: • tstellarAMD.

arsenm added a subscriber: llvm-commits.

Herald added a subscriber: arsenm. · View Herald TranscriptJan 11 2016, 8:13 AM

Other than the removed test, LGTM.

lib/Target/AMDGPU/AMDGPUISelLowering.cpp
2233–2236	I was thinking a bit about this because of all the i64, but it quickly gets messy and it's not clear to me that there is a much better way. I wonder whether bitcasting u to v2i32 and only shifting the high dword by 8 results in better code, but I'm fine with not trying that.
test/CodeGen/AMDGPU/uint_to_fp.ll
124–132	I think the R600 variant of the test should stay.

arsenm added inline comments.Jan 11 2016, 1:21 PM

lib/Target/AMDGPU/AMDGPUISelLowering.cpp
2233–2236	There are a few missing combines I'm working on that impact this that SC does. For example, the > 32 bit shift is split into a 32-bit shift and a mov 0. It's best to implement those separately rather than trying to specially emit them here

r257393 with r600 test readded

This revision is now accepted and ready to land.Jan 11 2016, 2:05 PM

arsenm closed this revision.Jan 11 2016, 2:05 PM

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

AMDGPUISelLowering.h

1 line

AMDGPUISelLowering.cpp

117 lines

test/

CodeGen/

AMDGPU/

sint_to_fp.i64.ll

62 lines

uint_to_fp.i64.ll

57 lines

uint_to_fp.ll

16 lines

Diff 44512

lib/Target/AMDGPU/AMDGPUISelLowering.h

Show First 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	private:

SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;

		SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;		SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;		SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;

▲ Show 20 Lines • Show All 253 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 2,217 Lines • ▼ Show 20 Lines	if (!ZeroUndef) {
// behavior is to return the number of bits.		// behavior is to return the number of bits.
NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,		NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
SrcIsZero, Bits32, NewCtlz);		SrcIsZero, Bits32, NewCtlz);
}		}

return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);		return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
}		}

		// Lower an i64 -> f32 [su]int_to_fp by assembling the single-precision bit
		// pattern with integer operations, following the reference pseudo-code below.
		// Op is the conversion node (operand 0 is the integer source); Signed selects
		// the sint_to_fp variant. Returns the f32 result value.
		SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
		bool Signed) const {
		// Unsigned
		// cul2f(ulong u)
		//{
		// uint lz = clz(u);
		// uint e = (u != 0) ? 127U + 63U - lz : 0;
		// u = (u << lz) & 0x7fffffffffffffffUL;
		// ulong t = u & 0xffffffffffUL;
		// uint v = (e << 23) | (uint)(u >> 40);
		// uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
		nhaehnleUnsubmitted Not Done Reply Inline Actions I was thinking a bit about this because of all the i64, but it quickly gets messy and it's not clear to me that there is a much better way. I wonder whether bitcasting u to v2i32 and only shifting the high dword by 8 results in better code, but I'm fine with not trying that. nhaehnle: I was thinking a bit about this because of all the i64, but it quickly gets messy and it's not…
		arsenmAuthorUnsubmitted Not Done Reply Inline Actions There are a few missing combines I'm working on that impact this that SC does. For example, the > 32 bit shift is split into a 32-bit shift and a mov 0. It's best to implement those separately rather than trying to specially emit them here arsenm: There are a few missing combines I'm working on that impact this that SC does. For example, the…
		// return as_float(v + r);
		//}
		// Signed
		// cl2f(long l)
		//{
		// long s = l >> 63;
		// float r = cul2f((l + s) ^ s);
		// return s ? -r : r;
		//}

		SDLoc SL(Op);
		SDValue Src = Op.getOperand(0);
		SDValue L = Src;

		// Signed case: S is the arithmetic shift of the input by 63 (all zeros or
		// all ones), and (L + S) ^ S is the value handed to the unsigned path.
		SDValue S;
		if (Signed) {
		const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
		S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);

		SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
		L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
		}

		// Result type used for every comparison below.
		// NOTE(review): queried with MVT::f32 although the compares operate on i64
		// values - confirm this is intended.
		EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
		*DAG.getContext(), MVT::f32);


		SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
		SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
		// lz: leading-zero count of L, narrowed to i32. The zero-undef form is used;
		// the exponent select below does not take the LZ-based value when L == 0.
		SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
		LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);

		// e: biased exponent 127 + 63 - lz, or 0 when the input is zero.
		SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
		SDValue E = DAG.getSelect(SL, MVT::i32,
		DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
		DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
		ZeroI32);

		// u: shift the leading one up to bit 63, then mask bit 63 off.
		SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
		DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
		DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));

		// t: low 40 bits of u, i.e. the bits dropped by the >> 40 below.
		SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
		DAG.getConstant(0xffffffffffULL, SL, MVT::i64));

		// Remaining high bits of u, used as the mantissa field.
		// NOTE(review): despite the name, this is a logical right shift (SRL).
		SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
		U, DAG.getConstant(40, SL, MVT::i64));

		// v: exponent shifted to bit 23, ORed with the truncated mantissa bits.
		SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
		DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
		DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));

		// Compare the dropped bits against the halfway value (only bit 39 set).
		SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
		SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
		SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);

		SDValue One = DAG.getConstant(1, SL, MVT::i32);

		SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);

		// r: rounding increment - 1 when above half, the low bit of v on an exact
		// tie, otherwise 0.
		SDValue R = DAG.getSelect(SL, MVT::i32,
		RCmp,
		One,
		DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
		// Integer add of the increment, then reinterpret the i32 bits as f32.
		R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
		R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);

		if (!Signed)
		return R;

		// Signed case: select the negated result when S is non-zero.
		SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
		return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
		}

SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,		SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
bool Signed) const {		bool Signed) const {
SDLoc SL(Op);		SDLoc SL(Op);
SDValue Src = Op.getOperand(0);		SDValue Src = Op.getOperand(0);

SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);		SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,		SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
Show All 9 Lines	SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,		SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
DAG.getConstant(32, SL, MVT::i32));		DAG.getConstant(32, SL, MVT::i32));
// TODO: Should this propagate fast-math-flags?		// TODO: Should this propagate fast-math-flags?
return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);		return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
}		}

SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,		SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDValue S0 = Op.getOperand(0);		assert(Op.getOperand(0).getValueType() == MVT::i64 &&
if (S0.getValueType() != MVT::i64)		"operation should be legal");
return SDValue();

EVT DestVT = Op.getValueType();		EVT DestVT = Op.getValueType();
if (DestVT == MVT::f64)		if (DestVT == MVT::f64)
return LowerINT_TO_FP64(Op, DAG, false);		return LowerINT_TO_FP64(Op, DAG, false);

assert(DestVT == MVT::f32);		if (DestVT == MVT::f32)
		return LowerINT_TO_FP32(Op, DAG, false);

SDLoc DL(Op);		return SDValue();

// f32 uint_to_fp i64
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
DAG.getConstant(0, DL, MVT::i32));
SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
DAG.getConstant(1, DL, MVT::i32));
SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
// TODO: Should this propagate fast-math-flags?
FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32
return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
}		}

SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,		SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);		assert(Op.getOperand(0).getValueType() == MVT::i64 &&
if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64)		"operation should be legal");

		EVT DestVT = Op.getValueType();
		if (DestVT == MVT::f32)
		return LowerINT_TO_FP32(Op, DAG, true);

		if (DestVT == MVT::f64)
return LowerINT_TO_FP64(Op, DAG, true);		return LowerINT_TO_FP64(Op, DAG, true);

return SDValue();		return SDValue();
}		}

SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,		SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
bool Signed) const {		bool Signed) const {
SDLoc SL(Op);		SDLoc SL(Op);
▲ Show 20 Lines • Show All 730 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/sint_to_fp.i64.ll

This file was added.

				; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
				; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s

				; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600

				; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f32:
				; Converts the i64 argument %in to float with sitofp and stores the result to %out.
				define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
				%result = sitofp i64 %in to float
				store float %result, float addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}v_sint_to_fp_i64_to_f32:
				; GCN: {{buffer|flat}}_load_dwordx2

				; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 63
				; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\]}}, 63, {{v\[[0-9]+:[0-9]+\]}}
				; GCN: v_xor_b32

				; GCN: v_ffbh_u32
				; GCN: v_ffbh_u32
				; GCN: v_cndmask
				; GCN: v_cndmask

				; GCN-DAG: v_cmp_eq_i64
				; GCN-DAG: v_cmp_lt_u64

				; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
				; GCN: v_cndmask_b32_e32 [[SIGN_SEL:v[0-9]+]],
				; GCN: {{buffer|flat}}_store_dword [[SIGN_SEL]]
				; Loads an i64 from %in at the index returned by @llvm.r600.read.tidig.x,
				; converts it to float with sitofp, and stores it at the same index in %out.
				define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
				%tid = call i32 @llvm.r600.read.tidig.x()
				%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
				%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				%val = load i64, i64 addrspace(1)* %in.gep
				%result = sitofp i64 %val to float
				store float %result, float addrspace(1)* %out.gep
				ret void
				}

				; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64:
				; Vector form: converts the <2 x i64> argument to <2 x float> with sitofp and stores it to %out.
				define void @s_sint_to_fp_v2i64(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
				%result = sitofp <2 x i64> %in to <2 x float>
				store <2 x float> %result, <2 x float> addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64:
				; Loads a <4 x i64> from %in at the index returned by @llvm.r600.read.tidig.x,
				; converts it to <4 x float> with sitofp, and stores it at the same index in %out.
				define void @v_sint_to_fp_v4i64(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
				%tid = call i32 @llvm.r600.read.tidig.x()
				%in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
				%out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
				%value = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep
				%result = sitofp <4 x i64> %value to <4 x float>
				store <4 x float> %result, <4 x float> addrspace(1)* %out.gep
				ret void
				}

				declare i32 @llvm.r600.read.tidig.x() #1

				attributes #0 = { nounwind }
				attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/uint_to_fp.i64.ll

This file was added.

				; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
				; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s

				; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600

				; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32:
				; Converts the i64 argument %in to float with uitofp and stores the result to %out.
				define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
				%result = uitofp i64 %in to float
				store float %result, float addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}v_uint_to_fp_i64_to_f32:
				; GCN: {{buffer|flat}}_load_dwordx2

				; GCN: v_ffbh_u32
				; GCN: v_ffbh_u32
				; GCN: v_cndmask
				; GCN: v_cndmask

				; GCN-DAG: v_cmp_eq_i64
				; GCN-DAG: v_cmp_lt_u64

				; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
				; GCN: {{buffer|flat}}_store_dword [[VR]]
				; Loads an i64 from %in at the index returned by @llvm.r600.read.tidig.x,
				; converts it to float with uitofp, and stores it at the same index in %out.
				define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
				%tid = call i32 @llvm.r600.read.tidig.x()
				%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
				%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
				%val = load i64, i64 addrspace(1)* %in.gep
				%result = uitofp i64 %val to float
				store float %result, float addrspace(1)* %out.gep
				ret void
				}

				; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64:
				; Vector form: converts the <2 x i64> argument to <2 x float> with uitofp and stores it to %out.
				define void @s_uint_to_fp_v2i64(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
				%result = uitofp <2 x i64> %in to <2 x float>
				store <2 x float> %result, <2 x float> addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64:
				; Loads a <4 x i64> from %in at the index returned by @llvm.r600.read.tidig.x,
				; converts it to <4 x float> with uitofp, and stores it at the same index in %out.
				define void @v_uint_to_fp_v4i64(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
				%tid = call i32 @llvm.r600.read.tidig.x()
				%in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
				%out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
				%value = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep
				%result = uitofp <4 x i64> %value to <4 x float>
				store <4 x float> %result, <4 x float> addrspace(1)* %out.gep
				ret void
				}

				declare i32 @llvm.r600.read.tidig.x() #1

				attributes #0 = { nounwind }
				attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/uint_to_fp.ll

Show First 20 Lines • Show All 109 Lines • ▼ Show 20 Lines	define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid		%in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid		%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%val = load i1, i1 addrspace(1)* %in.gep		%val = load i1, i1 addrspace(1)* %in.gep
%fp = uitofp i1 %val to float		%fp = uitofp i1 %val to float
store float %fp, float addrspace(1)* %out.gep		store float %fp, float addrspace(1)* %out.gep
ret void		ret void
}		}

; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32:
; SI: v_cvt_f32_u32_e32
; SI: v_cvt_f32_u32_e32
; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000
; SI: s_endpgm

; R600: UINT_TO_FLT
; R600: UINT_TO_FLT
; R600: MULADD_IEEE
; Converts the i64 argument %in to float with uitofp and stores the result to %out.
; The check lines above expect two 32-bit unsigned converts combined by a multiply-add.
define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
entry:
%cvt = uitofp i64 %in to float
store float %cvt, float addrspace(1)* %out
ret void
}
nhaehnleUnsubmitted Not Done Reply Inline Actions I think the R600 variant of the test should stay. nhaehnle: I think the R600 variant of the test should stay.

declare i32 @llvm.r600.read.tidig.x() #1		declare i32 @llvm.r600.read.tidig.x() #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }