This is an archive of the discontinued LLVM Phabricator instance.

Differential D52060

AMDGPU: Add a fast path for icmp.i1(src, false, NE)
ClosedPublic

Authored by mareko on Sep 13 2018, 2:30 PM.

Download Raw Diff

Details

Reviewers

arsenm
nhaehnle

Commits

rG33eb4d947d82: AMDGPU: Add a fast path for icmp.i1(src, false, NE)
rL351150: AMDGPU: Add a fast path for icmp.i1(src, false, NE)

Summary

This allows moving the condition from the intrinsic to the standard ICmp
opcode, so that LLVM can do simplifications on it. The icmp.i1 intrinsic
is an identity for retrieving the SGPR mask.

And we can also get the mask from "and i1", "or i1", and "xor i1".

Diff Detail

Repository

rL LLVM

Build Status

Buildable 22617
Build 22617: arc lint + arc unit

Event Timeline

mareko created this revision. Sep 13 2018, 2:30 PM

Herald added subscribers: t-tye, tpr, dstuttard and 4 others. · View Herald Transcript · Sep 13 2018, 2:30 PM

Harbormaster completed remote builds in B22617: Diff 165383. Sep 13 2018, 2:30 PM

Should the instcombine part change also to allow creation of i1 uses?

In D52060#1234370, @arsenm wrote:

Should the instcombine part change also to allow creation of i1 uses?

What do you mean by that? I'm not sure what you mean.

In D52060#1241470, @mareko wrote:

In D52060#1234370, @arsenm wrote:

Should the instcombine part change also to allow creation of i1 uses?

What do you mean by that? I'm not sure what you mean.

In InstCombineCalls we whitelist bitwidth sizes that are legal, so if the input compare is an i1 compare, it will fold into the intrinsic.

AMDGPU: Add a fast path for icmp.i1(src, false, NE)

Summary:
This allows moving the condition from the intrinsic to the standard ICmp
opcode, so that LLVM can do simplifications on it. The icmp.i1 intrinsic
is an identity for retrieving the SGPR mask.

And we can also get the mask from "and i1", "or i1", and "xor i1".

Don't fold icmp in InstCombineCalls.

Reviewers: arsenm, nhaehnle

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D52060

Harbormaster completed remote builds in B23531: Diff 168520. Oct 5 2018, 1:31 PM

arsenm added inline comments. Oct 29 2018, 6:30 PM

lib/Transforms/InstCombine/InstCombineCalls.cpp
3656–3658 ↗	(On Diff #168520)	Needs test in InstCombine

mareko added inline comments. Nov 20 2018, 3:18 PM

lib/Transforms/InstCombine/InstCombineCalls.cpp
3656–3658 ↗	(On Diff #168520)	What should the test do?

arsenm added inline comments. Nov 20 2018, 3:59 PM

lib/Transforms/InstCombine/InstCombineCalls.cpp
3656–3658 ↗	(On Diff #168520)	Use an original i1 eq/ne comparison. Like the others, just i1.

Add InstCombine tests.

Harbormaster completed remote builds in B25359: Diff 175387. Nov 26 2018, 8:30 PM

This revision was not accepted when it landed; it landed in state Needs Review. Jan 14 2019, 6:17 PM

Closed by commit rL351150: AMDGPU: Add a fast path for icmp.i1(src, false, NE) (authored by mareko). · Explain Why

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIISelLowering.cpp

4 lines

SIInstructions.td

5 lines

test/

CodeGen/

AMDGPU/

llvm.amdgcn.icmp.ll

18 lines

Diff 165383

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,055 Lines • ▼ Show 20 Lines	case Intrinsic::amdgcn_div_scale: {
// division operation.		// division operation.

SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;		SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,		return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
Denominator, Numerator);		Denominator, Numerator);
}		}
case Intrinsic::amdgcn_icmp: {		case Intrinsic::amdgcn_icmp: {
		if (Op.getOperand(1).getValueType() == MVT::i1 &&
		Op.getConstantOperandVal(2) == 0 &&
		Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
		return Op;
return lowerICMPIntrinsic(*this, Op.getNode(), DAG);		return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
}		}
case Intrinsic::amdgcn_fcmp: {		case Intrinsic::amdgcn_fcmp: {
return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);		return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
}		}
case Intrinsic::amdgcn_fmed3:		case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,		return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));		Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
▲ Show 20 Lines • Show All 4,162 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 591 Lines • ▼ Show 20 Lines
	>;			>;

	def : Pat <			def : Pat <
	(int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm<f32>:$imm, cond:$cond))),			(int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm<f32>:$imm, cond:$cond))),
	(SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))			(SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
	>;			>;
	// TODO: we could add more variants for other types of conditionals			// TODO: we could add more variants for other types of conditionals

				def : Pat <
				(int_amdgcn_icmp i1:$src, (i1 0), (i32 33)),
				(COPY $src) // Return the SGPRs representing i1 src
				>;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// VOP1 Patterns			// VOP1 Patterns
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	let SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] in {			let SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] in {

	//def : RcpPat<V_RCP_F64_e32, f64>;			//def : RcpPat<V_RCP_F64_e32, f64>;
	//defm : RsqPat<V_RSQ_F64_e32, f64>;			//defm : RsqPat<V_RSQ_F64_e32, f64>;
	▲ Show 20 Lines • Show All 1,042 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
	; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s			; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

	declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0			declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
	declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #0			declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #0
	declare i64 @llvm.amdgcn.icmp.i16(i16, i16, i32) #0			declare i64 @llvm.amdgcn.icmp.i16(i16, i16, i32) #0
				declare i64 @llvm.amdgcn.icmp.i1(i1, i1, i32) #0

	; No crash on invalid input			; No crash on invalid input
	; GCN-LABEL: {{^}}v_icmp_i32_dynamic_cc:			; GCN-LABEL: {{^}}v_icmp_i32_dynamic_cc:
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) {			define amdgpu_kernel void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) {
	%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 %cc)			%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 %cc)
	store i64 %result, i64 addrspace(1)* %out			store i64 %result, i64 addrspace(1)* %out
	ret void			ret void
	▲ Show 20 Lines • Show All 294 Lines • ▼ Show 20 Lines
	; SI-DAG: s_sext_i32_i16 [[CVT:s[0-9]+]], s{{[0-9]+}}			; SI-DAG: s_sext_i32_i16 [[CVT:s[0-9]+]], s{{[0-9]+}}
	; SI: v_cmp_le_i32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]			; SI: v_cmp_le_i32_e64 s{{\[[0-9]+:[0-9]+\]}}, [[CVT]], [[K]]
	define amdgpu_kernel void @v_icmp_i16_sle(i64 addrspace(1)* %out, i16 %src) {			define amdgpu_kernel void @v_icmp_i16_sle(i64 addrspace(1)* %out, i16 %src) {
	%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41)			%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41)
	store i64 %result, i64 addrspace(1)* %out			store i64 %result, i64 addrspace(1)* %out
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}v_icmp_i1_ne0:
				; GCN: v_cmp_gt_u32_e64 s[[C0:\[[0-9]+:[0-9]+\]]],
				; GCN: v_cmp_gt_u32_e64 s[[C1:\[[0-9]+:[0-9]+\]]],
				; GCN: s_and_b64 s[[SRC:\[[0-9]+:[0-9]+\]]], s[[C0]], s[[C1]]
				; SI-NEXT: s_mov_b32 s{{[0-9]+}}, -1
				; GCN-NEXT: v_mov_b32_e32
				; GCN-NEXT: v_mov_b32_e32
				; GCN-NEXT: {{global|flat|buffer}}_store_dwordx2
				define amdgpu_kernel void @v_icmp_i1_ne0(i64 addrspace(1)* %out, i32 %a, i32 %b) {
				%c0 = icmp ugt i32 %a, 1
				%c1 = icmp ugt i32 %b, 2
				%src = and i1 %c0, %c1
				%result = call i64 @llvm.amdgcn.icmp.i1(i1 %src, i1 false, i32 33)
				store i64 %result, i64 addrspace(1)* %out
				ret void
				}

	attributes #0 = { nounwind readnone convergent }			attributes #0 = { nounwind readnone convergent }