This is an archive of the discontinued LLVM Phabricator instance.

Pierre-vh retitled this revision from [AMDGPU] Use `S_BFE_U64` for unary i1-i64 ext to [AMDGPU] Use `S_BFE_U64` for uniform i1-i64 ext.Feb 24 2023, 3:32 AM

Pierre-vh added inline comments.Feb 24 2023, 3:34 AM

llvm/test/CodeGen/AMDGPU/saddo.ll
36	Not sure if it's a regression here. Yes there's one more instruction, but we're using more scalar instructions so isn't it beneficial in the end?
llvm/test/CodeGen/AMDGPU/usubo.ll
19	This looks a bit like a regression but I'm not sure how to address it. The pattern comes from `zext (setcc)`. I thought about adding a PatFrag that doesn't accept setcc operands to zext but it feels hacky. Thoughts?

foad added inline comments.Feb 24 2023, 3:55 AM

llvm/test/CodeGen/AMDGPU/saddo.ll
32	I'm not sure this is correct. The old code treated s[4:5] like a divergent boolean, with a bit for each active lane. The new code assumes the boolean value is in bit 0 - but will that work if lane 0 is not active?

foad added inline comments.Feb 24 2023, 4:00 AM

llvm/test/CodeGen/AMDGPU/saddo.ll
34	This looks like we could avoid the v_cndmask_b32_e64 if we used v_addc with the carry input coming from s[4:5]

arsenm added inline comments.Feb 24 2023, 4:03 AM

llvm/test/CodeGen/AMDGPU/saddo.ll
36	It's one more instruction because of the copy to VGPR for the store. We probably should have a rematerialize-as-VALU optimization to handle this kind of case.
llvm/test/CodeGen/AMDGPU/usubo.ll
19	Before this was a 32-bit select, so I assume this was a zext to i32 so I don't see why this zext to i64 change matters. What was the DAG here?

Pierre-vh added inline comments.Feb 24 2023, 4:23 AM

llvm/test/CodeGen/AMDGPU/saddo.ll
32	Very good question; I have a bit of trouble following what V_CNDMASK does exactly in this case. v0 is the destination, but what do the 0/1/s[4:5] operands correspond to? This function doesn't seem to select with GlobalISel, so I can't compare against that.
llvm/test/CodeGen/AMDGPU/usubo.ll
19	It was a zext to i64. The DAG was: `t41: i1 = setcc t39, t51, setugt:ch`; `t30: i64 = zero_extend t41`; `t31: i64 = add t39, t30`; `t37: v2i32 = bitcast t31`.

Harbormaster completed remote builds in B215707: Diff 500128.Feb 24 2023, 4:54 AM

foad added inline comments.Feb 24 2023, 5:35 AM

llvm/test/CodeGen/AMDGPU/saddo.ll
32	For each active lane, v0 gets 0 if the corresponding bit of s[4:5] is 0, and 1 if the corresponding bit is 1. `v_cndmask v0, v1, v2, s0` is a bit like `v0 = s0 ? v2 : v1` (NB v1/v2 are swapped!) where s0 holds the condition as a divergent boolean, 1 bit per lane.

Pierre-vh planned changes to this revision.Apr 17 2023, 6:39 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIInstructions.td

32 lines

test/

CodeGen/

AMDGPU/

72 lines

45 lines

45 lines

2 lines

Diff 500128

llvm/lib/Target/AMDGPU/SIInstructions.td

Show First 20 Lines • Show All 2,289 Lines • ▼ Show 20 Lines	def : GCNPat <
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)		(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;		>;

def : GCNPat <		def : GCNPat <
(i64 (anyext i32:$src)),		(i64 (anyext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)		(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;		>;

class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <		multiclass ZExt_i64_i1_Pat <SDNode ext> {
		def: GCNPat <
(i64 (ext i1:$src)),		(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,		(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 /src0mod/(i32 0), /src0/(i32 0),		(V_CNDMASK_B32_e64 /src0mod/(i32 0), /src0/(i32 0),
/src1mod/(i32 0), /src1/(i32 1), $src),		/src1mod/(i32 0), /src1/(i32 1), $src),
sub0, (S_MOV_B32 (i32 0)), sub1)		sub0, (S_MOV_B32 (i32 0)), sub1)
>;		>;

		let WaveSizePredicate = isWave32 in
		def : GCNPat <
		(i64 (UniformUnaryFrag<ext> SReg_1:$src)),
		(S_BFE_U64 (REG_SEQUENCE SReg_64, SReg_32:$src, sub0, (i32 (IMPLICIT_DEF)), sub1), (i32 0x10000))
		>;

		let WaveSizePredicate = isWave64 in
		def : GCNPat <
		(i64 (UniformUnaryFrag<ext> SReg_1:$src)),
		(S_BFE_U64 SReg_64:$src, (i32 0x10000))
		>;
		}


def : ZExt_i64_i1_Pat<zext>;		defm : ZExt_i64_i1_Pat<zext>;
def : ZExt_i64_i1_Pat<anyext>;		defm : ZExt_i64_i1_Pat<anyext>;

// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that		// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple outputs.		// REG_SEQUENCE patterns don't support instructions with multiple outputs.
def : GCNPat <		def : GCNPat <
(i64 (UniformUnaryFrag<sext> i32:$src)),		(i64 (UniformUnaryFrag<sext> i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0,		(REG_SEQUENCE SReg_64, $src, sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)		(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;		>;
▲ Show 20 Lines • Show All 1,331 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/saddo.ll

	Show All 23 Lines
	; SI-NEXT: s_add_u32 s10, s6, s8			; SI-NEXT: s_add_u32 s10, s6, s8
	; SI-NEXT: s_addc_u32 s11, s7, s9			; SI-NEXT: s_addc_u32 s11, s7, s9
	; SI-NEXT: v_mov_b32_e32 v1, s7			; SI-NEXT: v_mov_b32_e32 v1, s7
	; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]			; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
	; SI-NEXT: v_cmp_lt_i64_e64 s[6:7], s[8:9], 0			; SI-NEXT: v_cmp_lt_i64_e64 s[6:7], s[8:9], 0
	; SI-NEXT: s_mov_b32 s0, s4			; SI-NEXT: s_mov_b32 s0, s4
	; SI-NEXT: s_mov_b32 s1, s5			; SI-NEXT: s_mov_b32 s1, s5
	; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc			; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc
	; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]			; SI-NEXT: s_bfe_u64 s[4:5], s[4:5], 0x10000
				foadUnsubmitted Not Done Reply Inline Actions I'm not sure this is correct. The old code treated s[4:5] like a divergent boolean, with a bit for each active lane. The new code assumes the boolean value is in bit 0 - but will that work if lane 0 is not active? foad: I'm not sure this is correct. The old code treated s[4:5] like a divergent boolean, with a bit…
				Pierre-vhAuthorUnsubmitted Done Reply Inline Actions Very good question; I have a bit of trouble following what V_CNDMASK does exactly in this case. v0 is the destination, but what do the 0/1/s[4:5] correspond to? This function doesn't seem to select with global isel so I can't compare with that Pierre-vh: Very good question; I have a bit of trouble following what V_CNDMASK does exactly in this case.
				foadUnsubmitted Not Done Reply Inline Actions For each active lane, v0 gets 0 if the corresponding bit of s[4:5] is 0, and 1 if the corresponding bit is 1. `v_cndmask v0, v1, v2, s0` is a bit like `v0 = s0 ? v2 : v1` (NB v1/v2 are swapped!) where s0 holds the condition as a divergent boolean, 1 bit per lane. foad: For each active lane, v0 gets 0 if the corresponding bit of s[4:5] is 0, and 1 if the corresponding…
	; SI-NEXT: v_mov_b32_e32 v1, s11			; SI-NEXT: s_add_u32 s4, s10, s4
	; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0			; SI-NEXT: s_addc_u32 s5, s11, s5
	foadUnsubmitted Not Done Reply Inline Actions This looks like we could avoid the v_cndmask_b32_e64 if we used v_addc with the carry input coming from s[4:5] foad: This looks like we could avoid the v_cndmask_b32_e64 if we used v_addc with the carry input…
	; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc			; SI-NEXT: v_mov_b32_e32 v0, s4
				; SI-NEXT: v_mov_b32_e32 v1, s5
				Pierre-vhAuthorUnsubmitted Done Reply Inline Actions Not sure if it's a regression here. Yes there's one more instruction, but we're using more scalar instructions so isn't it beneficial in the end? Pierre-vh: Not sure if it's a regression here. Yes there's one more instruction, but we're using more…
				arsenmUnsubmitted Not Done Reply Inline Actions It's one more instruction because of the copy to VGPR for the store. We probably should have a rematerialize-as-VALU optimization to handle this kind of case. arsenm: It's one more instruction because of the copy to VGPR for the store. We probably should have a…
	; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0			; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
	; SI-NEXT: s_endpgm			; SI-NEXT: s_endpgm
	;			;
	; VI-LABEL: saddo_i64_zext:			; VI-LABEL: saddo_i64_zext:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34			; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
	; VI-NEXT: s_waitcnt lgkmcnt(0)			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: v_mov_b32_e32 v1, s6			; VI-NEXT: v_mov_b32_e32 v1, s6
	; VI-NEXT: s_add_u32 s2, s6, s0			; VI-NEXT: s_add_u32 s2, s6, s0
	; VI-NEXT: v_mov_b32_e32 v2, s7			; VI-NEXT: v_mov_b32_e32 v2, s7
	; VI-NEXT: s_addc_u32 s3, s7, s1			; VI-NEXT: s_addc_u32 s3, s7, s1
	; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
	; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]			; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
	; VI-NEXT: v_mov_b32_e32 v3, s3			; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
	; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
	; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
	; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
	; VI-NEXT: v_mov_b32_e32 v0, s4			; VI-NEXT: v_mov_b32_e32 v0, s4
				; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
				; VI-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x10000
				; VI-NEXT: s_add_u32 s0, s2, s0
				; VI-NEXT: s_addc_u32 s1, s3, s1
				; VI-NEXT: v_mov_b32_e32 v3, s1
	; VI-NEXT: v_mov_b32_e32 v1, s5			; VI-NEXT: v_mov_b32_e32 v1, s5
	; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc			; VI-NEXT: v_mov_b32_e32 v2, s0
	; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]			; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	;			;
	; GFX9-LABEL: saddo_i64_zext:			; GFX9-LABEL: saddo_i64_zext:
	; GFX9: ; %bb.0:			; GFX9: ; %bb.0:
	; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-NEXT: v_mov_b32_e32 v2, 0			; GFX9-NEXT: v_mov_b32_e32 v2, 0
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: v_mov_b32_e32 v0, s6			; GFX9-NEXT: v_mov_b32_e32 v0, s6
	; GFX9-NEXT: s_add_u32 s0, s6, s2			; GFX9-NEXT: s_add_u32 s0, s6, s2
	; GFX9-NEXT: v_mov_b32_e32 v1, s7			; GFX9-NEXT: v_mov_b32_e32 v1, s7
	; GFX9-NEXT: s_addc_u32 s1, s7, s3			; GFX9-NEXT: s_addc_u32 s1, s7, s3
	; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
	; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]			; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
				; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
				; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
				; GFX9-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x10000
				; GFX9-NEXT: s_add_u32 s0, s0, s2
				; GFX9-NEXT: s_addc_u32 s1, s1, s3
				; GFX9-NEXT: v_mov_b32_e32 v0, s0
	; GFX9-NEXT: v_mov_b32_e32 v1, s1			; GFX9-NEXT: v_mov_b32_e32 v1, s1
	; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
	; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
	; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
	; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
	; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]			; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; GFX10-LABEL: saddo_i64_zext:			; GFX10-LABEL: saddo_i64_zext:
	; GFX10: ; %bb.0:			; GFX10: ; %bb.0:
	; GFX10-NEXT: s_clause 0x1			; GFX10-NEXT: s_clause 0x1
	; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX10-NEXT: v_mov_b32_e32 v2, 0			; GFX10-NEXT: v_mov_b32_e32 v2, 0
	; GFX10-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-NEXT: s_add_u32 s0, s6, s2			; GFX10-NEXT: s_add_u32 s0, s6, s2
	; GFX10-NEXT: s_addc_u32 s1, s7, s3			; GFX10-NEXT: s_addc_u32 s1, s7, s3
	; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0			; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
	; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]			; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[0:1], s[6:7]
	; GFX10-NEXT: s_xor_b32 s2, s2, s3			; GFX10-NEXT: s_xor_b32 s2, s2, s6
	; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2			; GFX10-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x10000
	; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0			; GFX10-NEXT: s_add_u32 s0, s0, s2
	; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0			; GFX10-NEXT: s_addc_u32 s1, s1, s3
				; GFX10-NEXT: v_mov_b32_e32 v0, s0
				; GFX10-NEXT: v_mov_b32_e32 v1, s1
	; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]			; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
	; GFX10-NEXT: s_endpgm			; GFX10-NEXT: s_endpgm
	;			;
	; GFX11-LABEL: saddo_i64_zext:			; GFX11-LABEL: saddo_i64_zext:
	; GFX11: ; %bb.0:			; GFX11: ; %bb.0:
	; GFX11-NEXT: s_clause 0x1			; GFX11-NEXT: s_clause 0x1
	; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24			; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
	; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34			; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
	; GFX11-NEXT: v_mov_b32_e32 v2, 0
	; GFX11-NEXT: s_waitcnt lgkmcnt(0)			; GFX11-NEXT: s_waitcnt lgkmcnt(0)
	; GFX11-NEXT: s_add_u32 s2, s6, s0			; GFX11-NEXT: s_add_u32 s2, s6, s0
	; GFX11-NEXT: s_addc_u32 s3, s7, s1			; GFX11-NEXT: s_addc_u32 s3, s7, s1
	; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0			; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
	; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7]			; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[6:7]
	; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) \| instskip(NEXT) \| instid1(SALU_CYCLE_1)			; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) \| instskip(NEXT) \| instid1(SALU_CYCLE_1)
	; GFX11-NEXT: s_xor_b32 s0, s0, s1			; GFX11-NEXT: s_xor_b32 s0, s0, s6
	; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0			; GFX11-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x10000
	; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) \| instskip(NEXT) \| instid1(VALU_DEP_1)			; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
	; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0			; GFX11-NEXT: s_add_u32 s0, s2, s0
	; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0			; GFX11-NEXT: s_addc_u32 s1, s3, s1
				; GFX11-NEXT: v_mov_b32_e32 v0, s0
				; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
	; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]			; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
	; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)			; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
	; GFX11-NEXT: s_endpgm			; GFX11-NEXT: s_endpgm
	%sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind			%sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
	%val = extractvalue { i64, i1 } %sadd, 0			%val = extractvalue { i64, i1 } %sadd, 0
	%carry = extractvalue { i64, i1 } %sadd, 1			%carry = extractvalue { i64, i1 } %sadd, 1
	%ext = zext i1 %carry to i64			%ext = zext i1 %carry to i64
	%add2 = add i64 %val, %ext			%add2 = add i64 %val, %ext
	▲ Show 20 Lines • Show All 123 Lines • ▼ Show 20 Lines
	; SI-NEXT: s_mov_b32 s7, s11			; SI-NEXT: s_mov_b32 s7, s11
	; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0			; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
	; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0			; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
	; SI-NEXT: s_mov_b32 s8, s0			; SI-NEXT: s_mov_b32 s8, s0
	; SI-NEXT: s_mov_b32 s9, s1			; SI-NEXT: s_mov_b32 s9, s1
	; SI-NEXT: s_mov_b32 s4, s2			; SI-NEXT: s_mov_b32 s4, s2
	; SI-NEXT: s_mov_b32 s5, s3			; SI-NEXT: s_mov_b32 s5, s3
	; SI-NEXT: s_waitcnt vmcnt(0)			; SI-NEXT: s_waitcnt vmcnt(0)
	; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v1			; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0
	; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1			; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
	; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0			; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0
	; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]			; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
	; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]			; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
	; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0			; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0
	; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0			; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
	; SI-NEXT: s_endpgm			; SI-NEXT: s_endpgm
	;			;
	; VI-LABEL: v_saddo_i32:			; VI-LABEL: v_saddo_i32:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
	; VI-NEXT: s_waitcnt lgkmcnt(0)			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: v_mov_b32_e32 v0, s4			; VI-NEXT: v_mov_b32_e32 v0, s4
	; VI-NEXT: v_mov_b32_e32 v1, s5			; VI-NEXT: v_mov_b32_e32 v1, s5
	; VI-NEXT: v_mov_b32_e32 v2, s6			; VI-NEXT: v_mov_b32_e32 v2, s6
	; VI-NEXT: v_mov_b32_e32 v3, s7			; VI-NEXT: v_mov_b32_e32 v3, s7
	; VI-NEXT: flat_load_dword v4, v[0:1]			; VI-NEXT: flat_load_dword v4, v[0:1]
	; VI-NEXT: flat_load_dword v5, v[2:3]			; VI-NEXT: flat_load_dword v5, v[2:3]
	; VI-NEXT: v_mov_b32_e32 v0, s0			; VI-NEXT: v_mov_b32_e32 v0, s0
	; VI-NEXT: v_mov_b32_e32 v1, s1			; VI-NEXT: v_mov_b32_e32 v1, s1
	; VI-NEXT: v_mov_b32_e32 v2, s2			; VI-NEXT: v_mov_b32_e32 v2, s2
	; VI-NEXT: v_mov_b32_e32 v3, s3			; VI-NEXT: v_mov_b32_e32 v3, s3
	; VI-NEXT: s_waitcnt vmcnt(0)			; VI-NEXT: s_waitcnt vmcnt(0)
	; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v5			; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v4
	; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5			; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5
	; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4			; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4
	; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]			; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
	; VI-NEXT: flat_store_dword v[0:1], v6			; VI-NEXT: flat_store_dword v[0:1], v6
	; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]			; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
	; VI-NEXT: flat_store_byte v[2:3], v0			; VI-NEXT: flat_store_byte v[2:3], v0
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	;			;
	▲ Show 20 Lines • Show All 314 Lines • ▼ Show 20 Lines
	; SI-NEXT: s_mov_b32 s7, s11			; SI-NEXT: s_mov_b32 s7, s11
	; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0			; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
	; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0			; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
	; SI-NEXT: s_mov_b32 s8, s0			; SI-NEXT: s_mov_b32 s8, s0
	; SI-NEXT: s_mov_b32 s9, s1			; SI-NEXT: s_mov_b32 s9, s1
	; SI-NEXT: s_mov_b32 s12, s2			; SI-NEXT: s_mov_b32 s12, s2
	; SI-NEXT: s_mov_b32 s13, s3			; SI-NEXT: s_mov_b32 s13, s3
	; SI-NEXT: s_waitcnt vmcnt(0)			; SI-NEXT: s_waitcnt vmcnt(0)
	; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3			; SI-NEXT: v_add_i32_e32 v5, vcc, v3, v1
	; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2			; SI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
	; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3			; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
	; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1			; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
	; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2			; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
	; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0			; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0
	; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]			; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
	; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]			; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
	; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]			; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
	; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]			; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
	Show All 11 Lines
	; VI-NEXT: v_mov_b32_e32 v3, s7			; VI-NEXT: v_mov_b32_e32 v3, s7
	; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]			; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
	; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]			; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
	; VI-NEXT: v_mov_b32_e32 v4, s0			; VI-NEXT: v_mov_b32_e32 v4, s0
	; VI-NEXT: v_mov_b32_e32 v5, s1			; VI-NEXT: v_mov_b32_e32 v5, s1
	; VI-NEXT: v_mov_b32_e32 v6, s2			; VI-NEXT: v_mov_b32_e32 v6, s2
	; VI-NEXT: v_mov_b32_e32 v7, s3			; VI-NEXT: v_mov_b32_e32 v7, s3
	; VI-NEXT: s_waitcnt vmcnt(0)			; VI-NEXT: s_waitcnt vmcnt(0)
	; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3			; VI-NEXT: v_add_u32_e32 v9, vcc, v3, v1
	; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2			; VI-NEXT: v_add_u32_e32 v8, vcc, v2, v0
	; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3			; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
	; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1			; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
	; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2			; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
	; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0			; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
	; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]			; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
	; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]			; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
	; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]			; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
	; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]			; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
	▲ Show 20 Lines • Show All 78 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/uaddo.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=SI %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=SI %s
	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefixes=VI %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefixes=VI %s
	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GFX9 %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GFX9 %s

	define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {			define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
	; SI-LABEL: s_uaddo_i64_zext:			; SI-LABEL: s_uaddo_i64_zext:
	; SI: ; %bb.0:			; SI: ; %bb.0:
	; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9			; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
	; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd			; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
	; SI-NEXT: s_mov_b32 s3, 0xf000			; SI-NEXT: s_mov_b32 s3, 0xf000
	; SI-NEXT: s_mov_b32 s2, -1
	; SI-NEXT: s_waitcnt lgkmcnt(0)			; SI-NEXT: s_waitcnt lgkmcnt(0)
				; SI-NEXT: s_add_u32 s0, s6, s0
				; SI-NEXT: v_mov_b32_e32 v0, s6
				; SI-NEXT: v_mov_b32_e32 v1, s7
				; SI-NEXT: s_addc_u32 s1, s7, s1
				; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
				; SI-NEXT: s_bfe_u64 s[6:7], vcc, 0x10000
				; SI-NEXT: s_add_u32 s6, s0, s6
				; SI-NEXT: s_addc_u32 s7, s1, s7
				; SI-NEXT: s_mov_b32 s2, -1
	; SI-NEXT: s_mov_b32 s0, s4			; SI-NEXT: s_mov_b32 s0, s4
	; SI-NEXT: s_mov_b32 s1, s5			; SI-NEXT: s_mov_b32 s1, s5
	; SI-NEXT: s_add_u32 s4, s6, s8
	; SI-NEXT: v_mov_b32_e32 v0, s6			; SI-NEXT: v_mov_b32_e32 v0, s6
	; SI-NEXT: v_mov_b32_e32 v1, s7			; SI-NEXT: v_mov_b32_e32 v1, s7
	; SI-NEXT: s_addc_u32 s5, s7, s9
	; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
	; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
	; SI-NEXT: v_mov_b32_e32 v1, s5
	; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
	; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
	; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0			; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
	; SI-NEXT: s_endpgm			; SI-NEXT: s_endpgm
	;			;
	; VI-LABEL: s_uaddo_i64_zext:			; VI-LABEL: s_uaddo_i64_zext:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34			; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
	; VI-NEXT: s_waitcnt lgkmcnt(0)			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: v_mov_b32_e32 v2, s6			; VI-NEXT: v_mov_b32_e32 v1, s6
	; VI-NEXT: s_add_u32 s0, s6, s0			; VI-NEXT: s_add_u32 s0, s6, s0
	; VI-NEXT: v_mov_b32_e32 v3, s7
	; VI-NEXT: s_addc_u32 s1, s7, s1			; VI-NEXT: s_addc_u32 s1, s7, s1
	; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]			; VI-NEXT: v_mov_b32_e32 v2, s7
	; VI-NEXT: v_mov_b32_e32 v3, s1			; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[1:2]
	; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
	; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
	; VI-NEXT: v_mov_b32_e32 v0, s4			; VI-NEXT: v_mov_b32_e32 v0, s4
				; VI-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000
				; VI-NEXT: s_add_u32 s0, s0, s2
				; VI-NEXT: s_addc_u32 s1, s1, s3
				; VI-NEXT: v_mov_b32_e32 v3, s1
	; VI-NEXT: v_mov_b32_e32 v1, s5			; VI-NEXT: v_mov_b32_e32 v1, s5
	; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc			; VI-NEXT: v_mov_b32_e32 v2, s0
	; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]			; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	;			;
	; GFX9-LABEL: s_uaddo_i64_zext:			; GFX9-LABEL: s_uaddo_i64_zext:
	; GFX9: ; %bb.0:			; GFX9: ; %bb.0:
	; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-NEXT: v_mov_b32_e32 v2, 0			; GFX9-NEXT: v_mov_b32_e32 v2, 0
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: v_mov_b32_e32 v0, s6			; GFX9-NEXT: v_mov_b32_e32 v0, s6
	; GFX9-NEXT: s_add_u32 s0, s6, s2			; GFX9-NEXT: s_add_u32 s0, s6, s2
	; GFX9-NEXT: v_mov_b32_e32 v1, s7
	; GFX9-NEXT: s_addc_u32 s1, s7, s3			; GFX9-NEXT: s_addc_u32 s1, s7, s3
				; GFX9-NEXT: v_mov_b32_e32 v1, s7
	; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]			; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
				; GFX9-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000
				; GFX9-NEXT: s_add_u32 s0, s0, s2
				; GFX9-NEXT: s_addc_u32 s1, s1, s3
				; GFX9-NEXT: v_mov_b32_e32 v0, s0
	; GFX9-NEXT: v_mov_b32_e32 v1, s1			; GFX9-NEXT: v_mov_b32_e32 v1, s1
	; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
	; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
	; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
	; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]			; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)			%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
	%val = extractvalue { i64, i1 } %uadd, 0			%val = extractvalue { i64, i1 } %uadd, 0
	%carry = extractvalue { i64, i1 } %uadd, 1			%carry = extractvalue { i64, i1 } %uadd, 1
	%ext = zext i1 %carry to i64			%ext = zext i1 %carry to i64
	%add2 = add i64 %val, %ext			%add2 = add i64 %val, %ext
	store i64 %add2, ptr addrspace(1) %out, align 8			store i64 %add2, ptr addrspace(1) %out, align 8
	▲ Show 20 Lines • Show All 400 Lines • ▼ Show 20 Lines
	; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0			; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
	; SI-NEXT: s_mov_b32 s6, s10			; SI-NEXT: s_mov_b32 s6, s10
	; SI-NEXT: s_mov_b32 s7, s11			; SI-NEXT: s_mov_b32 s7, s11
	; SI-NEXT: s_mov_b32 s8, s0			; SI-NEXT: s_mov_b32 s8, s0
	; SI-NEXT: s_mov_b32 s9, s1			; SI-NEXT: s_mov_b32 s9, s1
	; SI-NEXT: s_mov_b32 s4, s2			; SI-NEXT: s_mov_b32 s4, s2
	; SI-NEXT: s_mov_b32 s5, s3			; SI-NEXT: s_mov_b32 s5, s3
	; SI-NEXT: s_waitcnt vmcnt(0)			; SI-NEXT: s_waitcnt vmcnt(0)
	; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1			; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
	; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0			; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
	; SI-NEXT: buffer_store_short v0, off, s[8:11], 0			; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
	; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0			; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
	; SI-NEXT: s_waitcnt expcnt(0)			; SI-NEXT: s_waitcnt expcnt(0)
	; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc			; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
	; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0			; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
	; SI-NEXT: s_endpgm			; SI-NEXT: s_endpgm
	;			;
	▲ Show 20 Lines • Show All 402 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/usubo.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=SI %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=SI %s
	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefixes=VI %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefixes=VI %s
	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GFX9 %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GFX9 %s


	define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {			define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
	; SI-LABEL: s_usubo_i64_zext:			; SI-LABEL: s_usubo_i64_zext:
	; SI: ; %bb.0:			; SI: ; %bb.0:
	; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9			; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
	; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd			; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
	; SI-NEXT: s_mov_b32 s3, 0xf000			; SI-NEXT: s_mov_b32 s3, 0xf000
	; SI-NEXT: s_mov_b32 s2, -1
	; SI-NEXT: s_waitcnt lgkmcnt(0)			; SI-NEXT: s_waitcnt lgkmcnt(0)
				; SI-NEXT: s_sub_u32 s0, s6, s0
				; SI-NEXT: v_mov_b32_e32 v0, s6
				; SI-NEXT: v_mov_b32_e32 v1, s7
				; SI-NEXT: s_subb_u32 s1, s7, s1
				; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
				; SI-NEXT: s_bfe_u64 s[6:7], vcc, 0x10000
				Pierre-vhAuthorUnsubmitted Done Reply Inline Actions This looks a bit like a regression but I'm not sure how to address it. The pattern comes from `zext (setcc)`. I thought about adding a PatFrag that doesn't accept setcc operands to zext but it feels hacky. Thoughts? Pierre-vh: This looks a bit like a regression but I'm not sure how to address it. The pattern comes from…
				arsenmUnsubmitted Not Done Reply Inline Actions Before this was a 32-bit select, so I assume this was a zext to i32 so I don't see why this zext to i64 change matters. What was the DAG here? arsenm: Before this was a 32-bit select, so I assume this was a zext to i32 so I don't see why this…
				Pierre-vhAuthorUnsubmitted Done Reply Inline Actions It was a zext to i64 t41: i1 = setcc t39, t51, setugt:ch t30: i64 = zero_extend t41 t31: i64 = add t39, t30 t37: v2i32 = bitcast t31 Pierre-vh: It was a zext to i64 ``` t41: i1 = setcc t39, t51, setugt:ch t30: i64 =…
				; SI-NEXT: s_add_u32 s6, s0, s6
				; SI-NEXT: s_addc_u32 s7, s1, s7
				; SI-NEXT: s_mov_b32 s2, -1
	; SI-NEXT: s_mov_b32 s0, s4			; SI-NEXT: s_mov_b32 s0, s4
	; SI-NEXT: s_mov_b32 s1, s5			; SI-NEXT: s_mov_b32 s1, s5
	; SI-NEXT: s_sub_u32 s4, s6, s8
	; SI-NEXT: v_mov_b32_e32 v0, s6			; SI-NEXT: v_mov_b32_e32 v0, s6
	; SI-NEXT: v_mov_b32_e32 v1, s7			; SI-NEXT: v_mov_b32_e32 v1, s7
	; SI-NEXT: s_subb_u32 s5, s7, s9
	; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
	; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
	; SI-NEXT: v_mov_b32_e32 v1, s5
	; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
	; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
	; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0			; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
	; SI-NEXT: s_endpgm			; SI-NEXT: s_endpgm
	;			;
	; VI-LABEL: s_usubo_i64_zext:			; VI-LABEL: s_usubo_i64_zext:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34			; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
	; VI-NEXT: s_waitcnt lgkmcnt(0)			; VI-NEXT: s_waitcnt lgkmcnt(0)
	; VI-NEXT: v_mov_b32_e32 v2, s6			; VI-NEXT: v_mov_b32_e32 v1, s6
	; VI-NEXT: s_sub_u32 s0, s6, s0			; VI-NEXT: s_sub_u32 s0, s6, s0
	; VI-NEXT: v_mov_b32_e32 v3, s7
	; VI-NEXT: s_subb_u32 s1, s7, s1			; VI-NEXT: s_subb_u32 s1, s7, s1
	; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]			; VI-NEXT: v_mov_b32_e32 v2, s7
	; VI-NEXT: v_mov_b32_e32 v3, s1			; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[1:2]
	; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
	; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
	; VI-NEXT: v_mov_b32_e32 v0, s4			; VI-NEXT: v_mov_b32_e32 v0, s4
				; VI-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000
				; VI-NEXT: s_add_u32 s0, s0, s2
				; VI-NEXT: s_addc_u32 s1, s1, s3
				; VI-NEXT: v_mov_b32_e32 v3, s1
	; VI-NEXT: v_mov_b32_e32 v1, s5			; VI-NEXT: v_mov_b32_e32 v1, s5
	; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc			; VI-NEXT: v_mov_b32_e32 v2, s0
	; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]			; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
	; VI-NEXT: s_endpgm			; VI-NEXT: s_endpgm
	;			;
	; GFX9-LABEL: s_usubo_i64_zext:			; GFX9-LABEL: s_usubo_i64_zext:
	; GFX9: ; %bb.0:			; GFX9: ; %bb.0:
	; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24			; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
	; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34			; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
	; GFX9-NEXT: v_mov_b32_e32 v2, 0			; GFX9-NEXT: v_mov_b32_e32 v2, 0
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: v_mov_b32_e32 v0, s6			; GFX9-NEXT: v_mov_b32_e32 v0, s6
	; GFX9-NEXT: s_sub_u32 s0, s6, s2			; GFX9-NEXT: s_sub_u32 s0, s6, s2
	; GFX9-NEXT: v_mov_b32_e32 v1, s7
	; GFX9-NEXT: s_subb_u32 s1, s7, s3			; GFX9-NEXT: s_subb_u32 s1, s7, s3
				; GFX9-NEXT: v_mov_b32_e32 v1, s7
	; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]			; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
				; GFX9-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000
				; GFX9-NEXT: s_add_u32 s0, s0, s2
				; GFX9-NEXT: s_addc_u32 s1, s1, s3
				; GFX9-NEXT: v_mov_b32_e32 v0, s0
	; GFX9-NEXT: v_mov_b32_e32 v1, s1			; GFX9-NEXT: v_mov_b32_e32 v1, s1
	; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
	; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
	; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
	; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]			; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0			%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
	%val = extractvalue { i64, i1 } %usub, 0			%val = extractvalue { i64, i1 } %usub, 0
	%carry = extractvalue { i64, i1 } %usub, 1			%carry = extractvalue { i64, i1 } %usub, 1
	%ext = zext i1 %carry to i64			%ext = zext i1 %carry to i64
	%add2 = add i64 %val, %ext			%add2 = add i64 %val, %ext
	store i64 %add2, ptr addrspace(1) %out, align 8			store i64 %add2, ptr addrspace(1) %out, align 8
	▲ Show 20 Lines • Show All 399 Lines • ▼ Show 20 Lines
	; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0			; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
	; SI-NEXT: s_mov_b32 s6, s10			; SI-NEXT: s_mov_b32 s6, s10
	; SI-NEXT: s_mov_b32 s7, s11			; SI-NEXT: s_mov_b32 s7, s11
	; SI-NEXT: s_mov_b32 s8, s0			; SI-NEXT: s_mov_b32 s8, s0
	; SI-NEXT: s_mov_b32 s9, s1			; SI-NEXT: s_mov_b32 s9, s1
	; SI-NEXT: s_mov_b32 s4, s2			; SI-NEXT: s_mov_b32 s4, s2
	; SI-NEXT: s_mov_b32 s5, s3			; SI-NEXT: s_mov_b32 s5, s3
	; SI-NEXT: s_waitcnt vmcnt(0)			; SI-NEXT: s_waitcnt vmcnt(0)
	; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1			; SI-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0
	; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0			; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
	; SI-NEXT: buffer_store_short v0, off, s[8:11], 0			; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
	; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0			; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
	; SI-NEXT: s_waitcnt expcnt(0)			; SI-NEXT: s_waitcnt expcnt(0)
	; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc			; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
	; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0			; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
	; SI-NEXT: s_endpgm			; SI-NEXT: s_endpgm
	;			;
	▲ Show 20 Lines • Show All 333 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/zero_extend.ll

Show All 32 Lines	define amdgpu_kernel void @s_arg_zext_i1_to_i64(ptr addrspace(1) %out, i1 zeroext %arg) #0 {
%ext = zext i1 %arg to i64		%ext = zext i1 %arg to i64
store i64 %ext, ptr addrspace(1) %out, align 8		store i64 %ext, ptr addrspace(1) %out, align 8
ret void		ret void
}		}

; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64:		; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0		; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0
; GCN-DAG: s_cmp_eq_u32		; GCN-DAG: s_cmp_eq_u32
; GCN: v_cndmask_b32		; GCN: s_bfe_u64
define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {		define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
%cmp = icmp eq i32 %a, %b		%cmp = icmp eq i32 %a, %b
%ext = zext i1 %cmp to i64		%ext = zext i1 %cmp to i64
store i64 %ext, ptr addrspace(1) %out, align 8		store i64 %ext, ptr addrspace(1) %out, align 8
ret void		ret void
}		}

; FIXME: Why different commute?		; FIXME: Why different commute?
Show All 18 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Use `S_BFE_U64` for uniform i1-i64 ext (Changes Planned, Public)

Details

Diff Detail

Event Timeline