This is an archive of the discontinued LLVM Phabricator instance.

Differential D6970

R600/SI: Try to form (f64 [s|u]int_to_fp i1)
AbandonedPublic

Authored by arsenm on Jan 14 2015, 9:53 AM.

Download Raw Diff

Details

Reviewers

rampitec

Group Reviewers

Restricted Project

Summary

This is un-optimized by the DAG combiner now to avoid
the from-i1 conversion. We get slightly better code
by doing this than materializing the weird constants
since there is no 64-bit select, so the select ends up getting split
up. The expanded pattern also shows up in fceil / ffloor
lowering.

This is worse depending on the rate of v_cvt_f64_i32. I'm
not sure the scheduling model is accurate for every subtarget;
llvm-mca is saying v_cvt_f64_i32 is quarter rate, but I believe
it is supposed to be half rate (or at least it used to be on older
subtargets).

Diff Detail

Event Timeline

arsenm updated this revision to Diff 18163.Jan 14 2015, 9:53 AM

arsenm retitled this revision from to R600/SI: Try to form (f64 [s|u]int_to_fp i1) .

arsenm updated this object.

arsenm edited the test plan for this revision. (Show Details)

arsenm added a subscriber: Unknown Object (MLST).

ping

Rebase 7 years into the future.

This optimization doesn't make sense if v_cvt_f64_i32 is quarter rate, which it is on some subtargets at least. I'm not sure the scheduler model is 100% accurate
for every subtarget, and it seems to always say it's quarter rate.

Herald added projects: Restricted Project, Restricted Project. · View Herald TranscriptNov 12 2022, 10:44 AM

Herald added subscribers: kosarev, foad, ecnelises and 4 others. · View Herald Transcript

Harbormaster completed remote builds in B197377: Diff 474949.Nov 12 2022, 11:10 AM

arsenm added a reviewer: rampitec.Nov 18 2022, 5:12 PM

Looks reasonable, but description mentions R600/SI while it affects other targets. I.e. it needs to be retitled.

This revision is now accepted and ready to land.Nov 28 2022, 12:45 PM

In D6970#3954786, @rampitec wrote:

Looks reasonable, but description mentions R600/SI while it affects other targets. I.e. it needs to be retitled.

It's slower assuming quarter rate conversions, which is what the scheduler thinks is the case. I was thinking of just abandoning this

In D6970#3954790, @arsenm wrote:

In D6970#3954786, @rampitec wrote:

Looks reasonable, but description mentions R600/SI while it affects other targets. I.e. it needs to be retitled.

It's slower assuming quarter rate conversions, which is what the scheduler thinks is the case. I was thinking of just abandoning this

Actually GFX11SpeedModel suggests it is even slower...

This is only conditionally faster and it might not actually be. I don't have time to look into this

Revision Contents

Path

Size

lib/

Target/

R600/

SIISelLowering.cpp

41 lines

test/

CodeGen/

R600/

4 lines

4 lines

47 lines

47 lines

Diff 18163

lib/Target/R600/SIISelLowering.cpp

Context not available.
	SDValue Zero = DAG.getConstant(0, MVT::i32);	SDValue Zero = DAG.getConstant(0, MVT::i32);
	SDValue One = DAG.getConstant(1, MVT::i32);	SDValue One = DAG.getConstant(1, MVT::i32);

	SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));	SDValue LHS = Op.getOperand(1);
	SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));	SDValue RHS = Op.getOperand(2);

		// Undo combine done in visitSINT_TO_FP / visitUINT_TO_FP.
		// f64 (select (i1 cnd), [+\|-]1.0, 0.0) -> f64 [u\|s]int_to_fp (i1 cnd)
		//
		// It is larger and more expensive to do the 2 selects and materialize the weird
		// constant than selecting an i32 -1 / 0 and doing the conversion to f64.
		//
		// = 16 byte, 12 cycle
		// v_cndmask_b32_e32 v0, 0, -1, s[0:1]
		// v_cvt_f64_i32_e32 v[0:1], v0
		//
		// vs.
		//
		// = 20 byte, 16 cycle
		// v_mov_b32_e32 v0, 0xbff00000
		// v_cndmask_b32_e64 v1, 0, v0, s[0:1]
		// v_mov_b32 v0, 0
		//

		if (const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS)) {
		if (CRHS->isNullValue()) {
		if (const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS)) {
		if (CLHS->getZExtValue() == DoubleToBits(-1.0)) {
		SDValue Cvt = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f64, Cond);
		return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Cvt);
		}

		if (CLHS->getZExtValue() == DoubleToBits(1.0)) {
		SDValue Cvt = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f64, Cond);
		return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Cvt);
		}
		}
		}
		}

		LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, LHS);
		RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, RHS);

	SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);	SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
	SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);	SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
Context not available.

test/CodeGen/R600/fceil64.ll

Context not available.
	; SI: v_cmp_lg_f64	; SI: v_cmp_lg_f64
	; SI: v_cmp_gt_f64	; SI: v_cmp_gt_f64
	; SI: s_and_b64	; SI: s_and_b64
	; SI: v_cndmask_b32	; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1,
	; SI: v_cndmask_b32	; SI-NEXT: v_cvt_f64_u32_e32
	; SI: v_add_f64	; SI: v_add_f64
	; SI: s_endpgm	; SI: s_endpgm
	define void @fceil_f64(double addrspace(1)* %out, double %x) {	define void @fceil_f64(double addrspace(1)* %out, double %x) {
Context not available.

test/CodeGen/R600/ffloor.ll

Context not available.
	; SI: v_cmp_lg_f64	; SI: v_cmp_lg_f64
	; SI: v_cmp_lt_f64	; SI: v_cmp_lt_f64
	; SI: s_and_b64	; SI: s_and_b64
	; SI: v_cndmask_b32	; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
	; SI: v_cndmask_b32	; SI-NEXT: v_cvt_f64_i32_e32
	; SI: v_add_f64	; SI: v_add_f64
	; SI: s_endpgm	; SI: s_endpgm
	define void @ffloor_f64(double addrspace(1)* %out, double %x) {	define void @ffloor_f64(double addrspace(1)* %out, double %x) {
Context not available.

test/CodeGen/R600/sint_to_fp.f64.ll

Context not available.

	; SI-LABEL: {{^}}sint_to_fp_i1_f64:	; SI-LABEL: {{^}}sint_to_fp_i1_f64:
	; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],	; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
	; We can't fold the SGPRs into v_cndmask_b32_e64, because it already	; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, [[CMP]]
	; uses an SGPR for [[CMP]]	; SI: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
	; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]	; SI: buffer_store_dwordx2 [[RESULT]]
	; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
	; SI: buffer_store_dwordx2
	; SI: s_endpgm	; SI: s_endpgm
	define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {	define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
	%cmp = icmp eq i32 %in, 0	%cmp = icmp eq i32 %in, 0
Context not available.
	store double %result, double addrspace(1)* %out	store double %result, double addrspace(1)* %out
	ret void	ret void
	}	}

		; SI-LABEL: {{^}}select_sint_to_fp_i1_vals_f64:
		; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
		; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, [[CMP]]
		; SI: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
		; SI: buffer_store_dwordx2 [[RESULT]]
		; SI: s_endpgm
		define void @select_sint_to_fp_i1_vals_f64(double addrspace(1)* %out, i32 %in) {
		%cmp = icmp eq i32 %in, 0
		%select = select i1 %cmp, double -1.0, double 0.0
		store double %select, double addrspace(1)* %out, align 8
		ret void
		}

		; SI-LABEL: {{^}}select_sint_to_fp_i1_vals_i64:
		; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
		; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, [[CMP]]
		; SI: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
		; SI: buffer_store_dwordx2 [[RESULT]]
		; SI: s_endpgm
		define void @select_sint_to_fp_i1_vals_i64(i64 addrspace(1)* %out, i32 %in) {
		%cmp = icmp eq i32 %in, 0
		%select = select i1 %cmp, i64 u0xbff0000000000000, i64 0
		store i64 %select, i64 addrspace(1)* %out, align 8
		ret void
		}

		; TODO: This should swap the selected order / invert the compare and do it.
		; SI-LABEL: {{^}}swap_select_sint_to_fp_i1_vals_f64:
		; SI-NOT: v_cvt_f64_u32
		; SI: v_cndmask_b32_e64
		; SI: v_cndmask_b32_e64
		; SI: s_endpgm
		define void @swap_select_sint_to_fp_i1_vals_f64(double addrspace(1)* %out, i32 %in) {
		%cmp = icmp eq i32 %in, 0
		%select = select i1 %cmp, double 0.0, double -1.0
		store double %select, double addrspace(1)* %out, align 8
		ret void
		}
Context not available.

test/CodeGen/R600/uint_to_fp.f64.ll

Context not available.

	; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:	; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
	; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],	; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
	; We can't fold the SGPRs into v_cndmask_b32_e64, because it already	; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, [[CMP]]
	; uses an SGPR for [[CMP]]	; SI: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
	; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]	; SI: buffer_store_dwordx2 [[RESULT]]
	; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
	; SI: buffer_store_dwordx2
	; SI: s_endpgm	; SI: s_endpgm
	define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {	define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
	%cmp = icmp eq i32 %in, 0	%cmp = icmp eq i32 %in, 0
Context not available.
	store double %fp, double addrspace(1)* %out, align 8	store double %fp, double addrspace(1)* %out, align 8
	ret void	ret void
	}	}

		; SI-LABEL: {{^}}select_uint_to_fp_i1_vals_f64:
		; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
		; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, [[CMP]]
		; SI: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
		; SI: buffer_store_dwordx2 [[RESULT]]
		; SI: s_endpgm
		define void @select_uint_to_fp_i1_vals_f64(double addrspace(1)* %out, i32 %in) {
		%cmp = icmp eq i32 %in, 0
		%select = select i1 %cmp, double 1.0, double 0.0
		store double %select, double addrspace(1)* %out, align 8
		ret void
		}

		; SI-LABEL: {{^}}select_uint_to_fp_i1_vals_i64:
		; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
		; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, [[CMP]]
		; SI: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
		; SI: buffer_store_dwordx2 [[RESULT]]
		; SI: s_endpgm
		define void @select_uint_to_fp_i1_vals_i64(i64 addrspace(1)* %out, i32 %in) {
		%cmp = icmp eq i32 %in, 0
		%select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
		store i64 %select, i64 addrspace(1)* %out, align 8
		ret void
		}

		; TODO: This should swap the selected order / invert the compare and do it.
		; SI-LABEL: {{^}}swap_select_uint_to_fp_i1_vals_f64:
		; SI-NOT: v_cvt_f64_u32
		; SI: v_cndmask_b32_e64
		; SI: v_cndmask_b32_e64
		; SI: s_endpgm
		define void @swap_select_uint_to_fp_i1_vals_f64(double addrspace(1)* %out, i32 %in) {
		%cmp = icmp eq i32 %in, 0
		%select = select i1 %cmp, double 0.0, double 1.0
		store double %select, double addrspace(1)* %out, align 8
		ret void
		}
Context not available.