This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Enable divergence-driven 'ctpop' selection
ClosedPublic

Authored by alex-t on Dec 26 2021, 4:30 AM.

Download Raw Diff

Details

Reviewers

rampitec
foad

Commits

rG5d46263a5ac5: [AMDGPU] Enable divergence-driven 'ctpop' selection

Summary

This change adds patterns and divergence predicates for the ctpop (bitcount) nodes
so that they are selected according to their divergence.

Diff Detail

Repository: rG LLVM Github Monorepo

Unit Tests: Failed

	Time	Test
	50 ms	x64 debian > LLVM.Bindings/Go::go.test

Event Timeline

alex-t created this revision. · Dec 26 2021, 4:30 AM

Herald added subscribers: foad, kerbowa, hiraditya and 8 others. · View Herald Transcript · Dec 26 2021, 4:30 AM

alex-t requested review of this revision. · Dec 26 2021, 4:30 AM

Herald added a project: Restricted Project. · View Herald Transcript · Dec 26 2021, 4:30 AM

Herald added a subscriber: wdng. · View Herald Transcript

test file attributes corrected

Harbormaster completed remote builds in B140651: Diff 396216. · Dec 26 2021, 5:18 AM

foad added inline comments. · Dec 31 2021, 3:40 AM

llvm/lib/Target/AMDGPU/SIInstructions.td
1028	Do you really need COPY_TO_REGCLASS here?
llvm/lib/Target/AMDGPU/SOPInstructions.td
1374	Do we really need both this pattern and the one on line 252? Surely one of them is redundant?

Removed the unnecessary COPY_TO_REGCLASS. Test updated.

alex-t marked an inline comment as done. · Jan 6 2022, 5:15 AM

alex-t added inline comments.

llvm/lib/Target/AMDGPU/SOPInstructions.td
1374	These two are not exactly identical. The first one, at line 252, accepts i64 and returns i32. The second one accepts i64 and returns i64. Without the latter, no implicit zero-extension occurs.

Harbormaster completed remote builds in B141881: Diff 397856. · Jan 6 2022, 5:49 AM

LGTM.

llvm/lib/Target/AMDGPU/SOPInstructions.td
1374	Actually I see now, the i64 to i32 pattern is used for GlobalISel only, and the i64 to i64 pattern is used for SelectionDAG only.

This revision is now accepted and ready to land. · Jan 6 2022, 5:49 AM

This revision was landed with ongoing or failed builds. · Jan 7 2022, 5:05 AM

Closed by commit rG5d46263a5ac5: [AMDGPU] Enable divergence-driven 'ctpop' selection (authored by alex-t). · Explain Why

This revision was automatically updated to reflect the committed changes.

alex-t added a commit: rG5d46263a5ac5: [AMDGPU] Enable divergence-driven 'ctpop' selection.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIInstructions.td

10 lines

SOPInstructions.td

6 lines

test/

CodeGen/

AMDGPU/

divergence-driven-ctpop.ll

54 lines

Diff 396216

llvm/lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 1,005 Lines • ▼ Show 20 Lines
	let AddedComplexity = 1 in {			let AddedComplexity = 1 in {
	def : GCNPat <			def : GCNPat <
	(i32 (add (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)),			(i32 (add (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)),
	(V_BCNT_U32_B32_e64 $popcnt, $val)			(V_BCNT_U32_B32_e64 $popcnt, $val)
	>;			>;
	}			}

	def : GCNPat <			def : GCNPat <
	(i32 (ctpop i32:$popcnt)),			(i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)),
	(V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))			(V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
	>;			>;

	def : GCNPat <			def : GCNPat <
	(i16 (add (i16 (trunc (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)))), i16:$val)),			(i16 (add (i16 (trunc (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)))), i16:$val)),
	(V_BCNT_U32_B32_e64 $popcnt, $val)			(V_BCNT_U32_B32_e64 $popcnt, $val)
	>;			>;

				def : GCNPat <
				(i64 (DivergentUnaryFrag<ctpop> i64:$src)),
				(REG_SEQUENCE VReg_64,
				(V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)),
				(i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0,
				(i32 (COPY_TO_REGCLASS (i32 (V_MOV_B32_e32 (i32 0))), VGPR_32)), sub1)
				foadUnsubmitted Done Reply Inline Actions Do you really need COPY_TO_REGCLASS here? foad: Do you really need COPY_TO_REGCLASS here?
				>;

	/******** ============================================ ********/			/******** ============================================ ********/
	/******** Extraction, Insertion, Building and Casting ********/			/******** Extraction, Insertion, Building and Casting ********/
	/******** ============================================ ********/			/******** ============================================ ********/

	// Special case for 2 element vectors. REG_SEQUENCE produces better code			// Special case for 2 element vectors. REG_SEQUENCE produces better code
	// than an INSERT_SUBREG.			// than an INSERT_SUBREG.
	multiclass Insert_Element_V2<RegisterClass RC, ValueType elem_type, ValueType vec_type> {			multiclass Insert_Element_V2<RegisterClass RC, ValueType elem_type, ValueType vec_type> {
	def : GCNPat <			def : GCNPat <
	▲ Show 20 Lines • Show All 2,028 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SOPInstructions.td

Show First 20 Lines • Show All 240 Lines • ▼ Show 20 Lines	def S_BREV_B64 : SOP1_64 <"s_brev_b64",
[(set i64:$sdst, (bitreverse i64:$src0))]		[(set i64:$sdst, (bitreverse i64:$src0))]
>;		>;
} // End isReMaterializable = 1, isAsCheapAsAMove = 1		} // End isReMaterializable = 1, isAsCheapAsAMove = 1

let Defs = [SCC] in {		let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;		def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;		def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",		def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (ctpop i32:$src0))]		[(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
>;		>;
def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64",		def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64",
[(set i32:$sdst, (ctpop i64:$src0))]		[(set i32:$sdst, (UniformUnaryFrag<ctpop> i64:$src0))]
>;		>;
} // End Defs = [SCC]		} // End Defs = [SCC]

let isReMaterializable = 1 in {		let isReMaterializable = 1 in {
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;		def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;		def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64",		def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64",
[(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i64:$src0))]		[(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i64:$src0))]
▲ Show 20 Lines • Show All 1,105 Lines • ▼ Show 20 Lines
>;		>;

def : GCNPat <		def : GCNPat <
(int_amdgcn_endpgm),		(int_amdgcn_endpgm),
(S_ENDPGM (i16 0))		(S_ENDPGM (i16 0))
>;		>;

def : GCNPat <		def : GCNPat <
(i64 (ctpop i64:$src)),		(i64 (UniformUnaryFrag<ctpop> i64:$src)),
		foadUnsubmitted Not Done Reply Inline Actions Do we really need both this pattern and the one on line 252? Surely one of them is redundant? foad: Do we really need both this pattern and the one on line 252? Surely one of them is redundant?
		alex-tAuthorUnsubmitted Done Reply Inline Actions These two are not exactly identical. The first one, at line 252, accepts i64 and returns i32. The second one - accepts i64 and returns i64. W/o the latter one, no implicit zero extend occurs. alex-t: These two are not exactly identical. The first one, at line 252, accepts i64 and returns i32.
		foadUnsubmitted Not Done Reply Inline Actions Actually I see now, the i64 to i32 pattern is used for GlobalISel only, and the i64 to i64 pattern is used for SelectionDAG only. foad: Actually I see now, the i64 to i32 pattern is used for GlobalISel only, and the i64 to i64…
(i64 (REG_SEQUENCE SReg_64,		(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,		(i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
(S_MOV_B32 (i32 0)), sub1))		(S_MOV_B32 (i32 0)), sub1))
>;		>;

def : GCNPat <		def : GCNPat <
(i32 (smax i32:$x, (i32 (ineg i32:$x)))),		(i32 (smax i32:$x, (i32 (ineg i32:$x)))),
(S_ABS_I32 SReg_32:$x)		(S_ABS_I32 SReg_32:$x)
▲ Show 20 Lines • Show All 650 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/divergence-driven-ctpop.ll

This file was added.

				; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s

				; GCN-LABEL: name: s_ctpop_i32
				; GCN: S_BCNT1_I32_B32
				define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
				%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
				store i32 %ctpop, i32 addrspace(1)* %out, align 4
				ret void
				}

				; GCN-LABEL: name: s_ctpop_i64
				; GCN: %[[BCNT:[0-9]+]]:sreg_32 = S_BCNT1_I32_B64
				; GCN: %[[SREG1:[0-9]+]]:sreg_32 = COPY %[[BCNT]]
				; GCN: %[[SREG2:[0-9]+]]:sreg_32 = S_MOV_B32 0
				; GCN: REG_SEQUENCE killed %[[SREG1]], %subreg.sub0, killed %[[SREG2]], %subreg.sub1
				define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
				%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
				%truncctpop = trunc i64 %ctpop to i32
				store i32 %truncctpop, i32 addrspace(1)* %out, align 4
				ret void
				}

				; GCN-LABEL: name: v_ctpop_i32
				; GCN: V_BCNT_U32_B32_e64
				define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
				%val = load i32, i32 addrspace(1)* %in.gep, align 4
				%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
				store i32 %ctpop, i32 addrspace(1)* %out, align 4
				ret void
				}

				; GCN-LABEL: name: v_ctpop_i64
				; GCN: %[[BCNT1:[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 killed %{{[0-9]+}}, 0, implicit $exec
				; GCN: %[[BCNT2:[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 killed %{{[0-9]+}}, killed %[[BCNT1]], implicit $exec
				; GCN: %[[VGPR1:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
				; GCN: %[[VGPR2:[0-9]+]]:vgpr_32 = COPY %[[VGPR1]]
				; GCN: REG_SEQUENCE killed %[[BCNT2]], %subreg.sub0, killed %[[VGPR2]], %subreg.sub1
				define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
				%tid = call i32 @llvm.amdgcn.workitem.id.x()
				%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
				%val = load i64, i64 addrspace(1)* %in.gep, align 8
				%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
				%truncctpop = trunc i64 %ctpop to i32
				store i32 %truncctpop, i32 addrspace(1)* %out, align 4
				ret void
				}

				declare i64 @llvm.ctpop.i64(i64) nounwind readnone

				declare i32 @llvm.ctpop.i32(i32) nounwind readnone

				declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone