This is an archive of the discontinued LLVM Phabricator instance.

llvm/test/CodeGen/AMDGPU/mul_int24.ll
184–186	We should have been able to eliminate these shifts since the high bits are known unused. Maybe we need a SimplifyDemandedBits combine on the mul24/mulhi24 sources?

foad added a subscriber: foad. Oct 26 2021, 9:38 AM

foad added inline comments.

llvm/test/CodeGen/AMDGPU/mul_int24.ll
223	"lhs" should be "lshr.rhs" here.
286	Same.

Harbormaster completed remote builds in B130735: Diff 382360. Oct 26 2021, 10:11 AM

Test additions LGTM but they do show there's more to be done here

This revision is now accepted and ready to land. Oct 27 2021, 3:27 PM

abinavpp added inline comments. Oct 27 2021, 7:35 PM

llvm/test/CodeGen/AMDGPU/mul_int24.ll
223	I'm seeing this problem in amdgpu-codegenprepare-mul24.ll as well. Fix: D112685

Addressed Jay's comment.

abinavpp marked 2 inline comments as done. Oct 27 2021, 7:37 PM

This revision was landed with ongoing or failed builds. Oct 27 2021, 7:40 PM

Closed by commit rGfa592180b3f4: [AMDGPU] Add more llc tests for 48-bit mul generation. (authored by abinavpp). · Explain Why

This revision was automatically updated to reflect the committed changes.

abinavpp added a commit: rGfa592180b3f4: [AMDGPU] Add more llc tests for 48-bit mul generation.

Harbormaster completed remote builds in B131107: Diff 382891. Oct 27 2021, 8:28 PM

foad added inline comments. Oct 28 2021, 2:10 AM

llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
581–582	This seems to show that we've done a simplify-demanded-bits for the mul (because it is using the original v0 and v2) but not for the mul_hi (which is using the ANDed v1 and v3). Maybe AMDGPUTargetLowering::performIntrinsicWOChainCombine needs to call simplifyMul24 for the new mulhi intrinsics?

abinavpp added inline comments. Oct 28 2021, 4:04 AM

llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
581–582	I missed adding this before. Fix: D112702

Revision Contents

Path

Size

llvm/

test/

CodeGen/

AMDGPU/

mul_int24.ll

132 lines

mul_uint24-amdgcn.ll

66 lines

Diff 382891

llvm/test/CodeGen/AMDGPU/mul_int24.ll

Show First 20 Lines • Show All 171 Lines • ▼ Show 20 Lines	entry:
%b.24.i64 = sext i32 %b.24 to i64		%b.24.i64 = sext i32 %b.24 to i64
%mul48 = mul i64 %a.24.i64, %b.24.i64		%mul48 = mul i64 %a.24.i64, %b.24.i64
%mul48.hi = lshr i64 %mul48, 32		%mul48.hi = lshr i64 %mul48, 32
%mul24hi = trunc i64 %mul48.hi to i32		%mul24hi = trunc i64 %mul48.hi to i32
store i32 %mul24hi, i32 addrspace(1)* %out		store i32 %mul24hi, i32 addrspace(1)* %out
ret void		ret void
}		}

		define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
		; SI-LABEL: test_smul48_i64:
		; SI: ; %bb.0:
		; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
		; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
		; SI-NEXT: v_ashr_i64 v[3:4], v[0:1], 40
		arsenm (inline comment, not done): We should have been able to eliminate these shifts since the high bits are known unused. Maybe we need a SimplifyDemandedBits combine on the mul24/mulhi24 sources?
		; SI-NEXT: v_ashr_i64 v[1:2], v[1:2], 40
		; SI-NEXT: v_mul_i32_i24_e32 v0, v3, v1
		; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v3, v1
		; SI-NEXT: s_setpc_b64 s[30:31]
		;
		; VI-LABEL: test_smul48_i64:
		; VI: ; %bb.0:
		; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
		; VI-NEXT: v_ashrrev_i64 v[3:4], 40, v[0:1]
		; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
		; VI-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
		; VI-NEXT: v_mul_i32_i24_e32 v0, v3, v1
		; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v3, v1
		; VI-NEXT: s_setpc_b64 s[30:31]
		;
		; GFX9-LABEL: test_smul48_i64:
		; GFX9: ; %bb.0:
		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v0
		; GFX9-NEXT: v_ashrrev_i64 v[3:4], 40, v[0:1]
		; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2
		; GFX9-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
		; GFX9-NEXT: v_mul_i32_i24_e32 v0, v3, v1
		; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v3, v1
		; GFX9-NEXT: s_setpc_b64 s[30:31]
		;
		; EG-LABEL: test_smul48_i64:
		; EG: ; %bb.0:
		; EG-NEXT: CF_END
		; EG-NEXT: PAD
		;
		; CM-LABEL: test_smul48_i64:
		; CM: ; %bb.0:
		; CM-NEXT: CF_END
		; CM-NEXT: PAD
		%shl.lhs = shl i64 %lhs, 40
		foad (inline comment, done): "lhs" should be "lshr.rhs" here.
		abinavpp (author; inline comment, done): I'm seeing this problem in amdgpu-codegenprepare-mul24.ll as well. Fix: D112685
		%lhs24 = ashr i64 %shl.lhs, 40
		%shl.rhs = shl i64 %rhs, 40
		%rhs24 = ashr i64 %shl.rhs, 40
		%mul = mul i64 %lhs24, %rhs24
		ret i64 %mul
		}

		define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
		; SI-LABEL: test_smul48_v2i64:
		; SI: ; %bb.0:
		; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
		; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
		; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v6
		; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
		; SI-NEXT: v_ashr_i64 v[5:6], v[0:1], 40
		; SI-NEXT: v_ashr_i64 v[1:2], v[1:2], 40
		; SI-NEXT: v_ashr_i64 v[6:7], v[2:3], 40
		; SI-NEXT: v_ashr_i64 v[2:3], v[3:4], 40
		; SI-NEXT: v_mul_i32_i24_e32 v0, v1, v2
		; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v2
		; SI-NEXT: v_mul_i32_i24_e32 v2, v5, v6
		; SI-NEXT: v_mul_hi_i32_i24_e32 v3, v5, v6
		; SI-NEXT: s_setpc_b64 s[30:31]
		;
		; VI-LABEL: test_smul48_v2i64:
		; VI: ; %bb.0:
		; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
		; VI-NEXT: v_ashrrev_i64 v[7:8], 40, v[0:1]
		; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
		; VI-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
		; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
		; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
		; VI-NEXT: v_ashrrev_i64 v[3:4], 40, v[2:3]
		; VI-NEXT: v_ashrrev_i64 v[4:5], 40, v[1:2]
		; VI-NEXT: v_mul_i32_i24_e32 v0, v1, v3
		; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v3
		; VI-NEXT: v_mul_i32_i24_e32 v2, v7, v4
		; VI-NEXT: v_mul_hi_i32_i24_e32 v3, v7, v4
		; VI-NEXT: s_setpc_b64 s[30:31]
		;
		; GFX9-LABEL: test_smul48_v2i64:
		; GFX9: ; %bb.0:
		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2
		; GFX9-NEXT: v_ashrrev_i64 v[7:8], 40, v[0:1]
		; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v0
		; GFX9-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
		; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6
		; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v4
		; GFX9-NEXT: v_ashrrev_i64 v[3:4], 40, v[2:3]
		; GFX9-NEXT: v_ashrrev_i64 v[4:5], 40, v[1:2]
		; GFX9-NEXT: v_mul_i32_i24_e32 v0, v1, v3
		; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v3
		; GFX9-NEXT: v_mul_i32_i24_e32 v2, v7, v4
		; GFX9-NEXT: v_mul_hi_i32_i24_e32 v3, v7, v4
		; GFX9-NEXT: s_setpc_b64 s[30:31]
		;
		; EG-LABEL: test_smul48_v2i64:
		; EG: ; %bb.0:
		; EG-NEXT: CF_END
		; EG-NEXT: PAD
		foad (inline comment, done): Same.
		;
		; CM-LABEL: test_smul48_v2i64:
		; CM: ; %bb.0:
		; CM-NEXT: CF_END
		; CM-NEXT: PAD
		%shl.lhs = shl <2 x i64> %lhs, <i64 40, i64 40>
		%lhs24 = ashr <2 x i64> %shl.lhs, <i64 40, i64 40>
		%shl.rhs = shl <2 x i64> %rhs, <i64 40, i64 40>
		%rhs24 = ashr <2 x i64> %shl.rhs, <i64 40, i64 40>
		%mul = mul <2 x i64> %lhs24, %rhs24
		ret <2 x i64> %mul
		}

; This requires handling of the original 64-bit mul node to eliminate		; This requires handling of the original 64-bit mul node to eliminate
; unnecessary extension instructions because after legalization they		; unnecessary extension instructions because after legalization they
; will not be removed by SimplifyDemandedBits because there are		; will not be removed by SimplifyDemandedBits because there are
; multiple uses by the separate mul and mulhi.		; multiple uses by the separate mul and mulhi.
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {		define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_smul24_i64:		; SI-LABEL: test_smul24_i64:
; SI: ; %bb.0:		; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9		; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
▲ Show 20 Lines • Show All 395 Lines • ▼ Show 20 Lines
}		}

define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {		define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
; SI-LABEL: simplify_i24_crash:		; SI-LABEL: simplify_i24_crash:
; SI: ; %bb.0: ; %bb		; SI: ; %bb.0: ; %bb
; SI-NEXT: s_load_dword s2, s[0:1], 0xb		; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_waitcnt lgkmcnt(0)		; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 0		; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_cbranch_scc0 BB6_2		; SI-NEXT: s_cbranch_scc0 BB8_2
; SI-NEXT: ; %bb.1: ; %bb7		; SI-NEXT: ; %bb.1: ; %bb7
; SI-NEXT: s_endpgm		; SI-NEXT: s_endpgm
; SI-NEXT: BB6_2: ; %bb11		; SI-NEXT: BB8_2: ; %bb11
; SI-NEXT: s_load_dword s2, s[0:1], 0xd		; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_load_dword s4, s[0:1], 0xf		; SI-NEXT: s_load_dword s4, s[0:1], 0xf
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9		; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000		; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)		; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s2, s2, 0x180000		; SI-NEXT: s_bfe_i32 s2, s2, 0x180000
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000		; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
; SI-NEXT: s_mul_i32 s4, s2, s4		; SI-NEXT: s_mul_i32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1		; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4		; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s4		; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0		; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm		; SI-NEXT: s_endpgm
;		;
; VI-LABEL: simplify_i24_crash:		; VI-LABEL: simplify_i24_crash:
; VI: ; %bb.0: ; %bb		; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c		; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)		; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0		; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 BB6_2		; VI-NEXT: s_cbranch_scc0 BB8_2
; VI-NEXT: ; %bb.1: ; %bb7		; VI-NEXT: ; %bb.1: ; %bb7
; VI-NEXT: s_endpgm		; VI-NEXT: s_endpgm
; VI-NEXT: BB6_2: ; %bb11		; VI-NEXT: BB8_2: ; %bb11
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24		; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34		; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_load_dword s0, s[0:1], 0x3c		; VI-NEXT: s_load_dword s0, s[0:1], 0x3c
; VI-NEXT: s_mov_b32 s7, 0xf000		; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1		; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)		; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s1, s2, 0x180000		; VI-NEXT: s_bfe_i32 s1, s2, 0x180000
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000		; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
; VI-NEXT: s_mul_i32 s1, s1, s0		; VI-NEXT: s_mul_i32 s1, s1, s0
; VI-NEXT: v_mov_b32_e32 v0, s1		; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s1		; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0		; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm		; VI-NEXT: s_endpgm
;		;
; GFX9-LABEL: simplify_i24_crash:		; GFX9-LABEL: simplify_i24_crash:
; GFX9: ; %bb.0: ; %bb		; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c		; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)		; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s2, 0		; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc0 BB6_2		; GFX9-NEXT: s_cbranch_scc0 BB8_2
; GFX9-NEXT: ; %bb.1: ; %bb7		; GFX9-NEXT: ; %bb.1: ; %bb7
; GFX9-NEXT: s_endpgm		; GFX9-NEXT: s_endpgm
; GFX9-NEXT: BB6_2: ; %bb11		; GFX9-NEXT: BB8_2: ; %bb11
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24		; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34		; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x3c		; GFX9-NEXT: s_load_dword s3, s[0:1], 0x3c
; GFX9-NEXT: s_mov_b32 s7, 0xf000		; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1		; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)		; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000		; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000		; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
▲ Show 20 Lines • Show All 76 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll

Show First 20 Lines • Show All 565 Lines • ▼ Show 20 Lines	entry:
%a_24 = lshr i64 %tmp0, 40		%a_24 = lshr i64 %tmp0, 40
%tmp1 = shl i64 %b, 40		%tmp1 = shl i64 %b, 40
%b_24 = lshr i64 %tmp1, 40		%b_24 = lshr i64 %tmp1, 40
%tmp2 = mul i64 %a_24, %b_24		%tmp2 = mul i64 %a_24, %b_24
store i64 %tmp2, i64 addrspace(1)* %out		store i64 %tmp2, i64 addrspace(1)* %out
ret void		ret void
}		}

		define i64 @test_umul48_i64(i64 %lhs, i64 %rhs) {
		; GCN-LABEL: test_umul48_i64:
		; GCN: ; %bb.0:
		; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GCN-NEXT: s_mov_b32 s4, 0xffffff
		; GCN-NEXT: v_and_b32_e32 v1, s4, v0
		; GCN-NEXT: v_and_b32_e32 v3, s4, v2
		; GCN-NEXT: v_mul_u32_u24_e32 v0, v0, v2
		; GCN-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v3
		foad (inline comment, not done): This seems to show that we've done a simplify-demanded-bits for the mul (because it is using the original v0 and v2) but not for the mul_hi (which is using the ANDed v1 and v3). Maybe AMDGPUTargetLowering::performIntrinsicWOChainCombine needs to call simplifyMul24 for the new mulhi intrinsics?
		abinavpp (author; inline comment, done): I missed adding this before. Fix: D112702
		; GCN-NEXT: s_setpc_b64 s[30:31]
		%lhs24 = and i64 %lhs, 16777215
		%rhs24 = and i64 %rhs, 16777215
		%mul = mul i64 %lhs24, %rhs24
		ret i64 %mul
		}

		define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
		; SI-LABEL: test_umul48_v2i64:
		; SI: ; %bb.0:
		; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; SI-NEXT: s_mov_b32 s4, 0xffffff
		; SI-NEXT: v_mul_u32_u24_e32 v5, v0, v4
		; SI-NEXT: v_mul_u32_u24_e32 v7, v2, v6
		; SI-NEXT: v_and_b32_e32 v2, s4, v2
		; SI-NEXT: v_and_b32_e32 v0, s4, v0
		; SI-NEXT: v_and_b32_e32 v3, s4, v6
		; SI-NEXT: v_and_b32_e32 v1, s4, v4
		; SI-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
		; SI-NEXT: v_mul_hi_u32_u24_e32 v3, v2, v3
		; SI-NEXT: v_mov_b32_e32 v0, v5
		; SI-NEXT: v_mov_b32_e32 v2, v7
		; SI-NEXT: s_setpc_b64 s[30:31]
		;
		; VI-LABEL: test_umul48_v2i64:
		; VI: ; %bb.0:
		; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; VI-NEXT: s_mov_b32 s4, 0xffffff
		; VI-NEXT: v_and_b32_e32 v3, s4, v2
		; VI-NEXT: v_and_b32_e32 v1, s4, v0
		; VI-NEXT: v_and_b32_e32 v5, s4, v6
		; VI-NEXT: v_and_b32_e32 v7, s4, v4
		; VI-NEXT: v_mul_u32_u24_e32 v0, v0, v4
		; VI-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v7
		; VI-NEXT: v_mul_u32_u24_e32 v2, v2, v6
		; VI-NEXT: v_mul_hi_u32_u24_e32 v3, v3, v5
		; VI-NEXT: s_setpc_b64 s[30:31]
		;
		; GFX9-LABEL: test_umul48_v2i64:
		; GFX9: ; %bb.0:
		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GFX9-NEXT: s_mov_b32 s4, 0xffffff
		; GFX9-NEXT: v_and_b32_e32 v3, s4, v2
		; GFX9-NEXT: v_and_b32_e32 v1, s4, v0
		; GFX9-NEXT: v_and_b32_e32 v5, s4, v6
		; GFX9-NEXT: v_and_b32_e32 v7, s4, v4
		; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v4
		; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v7
		; GFX9-NEXT: v_mul_u32_u24_e32 v2, v2, v6
		; GFX9-NEXT: v_mul_hi_u32_u24_e32 v3, v3, v5
		; GFX9-NEXT: s_setpc_b64 s[30:31]
		%lhs24 = and <2 x i64> %lhs, <i64 16777215, i64 16777215>
		%rhs24 = and <2 x i64> %rhs, <i64 16777215, i64 16777215>
		%mul = mul <2 x i64> %lhs24, %rhs24
		ret <2 x i64> %mul
		}

define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {		define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
; SI-LABEL: test_umul24_i64_square:		; SI-LABEL: test_umul24_i64_square:
; SI: ; %bb.0: ; %entry		; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[0:1], 0x13		; SI-NEXT: s_load_dword s4, s[0:1], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9		; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000		; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1		; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)		; SI-NEXT: s_waitcnt lgkmcnt(0)
▲ Show 20 Lines • Show All 273 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add more llc tests for 48-bit mul generation. (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 382891

llvm/test/CodeGen/AMDGPU/mul_int24.ll

llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll

[AMDGPU] Add more llc tests for 48-bit mul generation.
ClosedPublic