diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -688,37 +688,243 @@
   ret <2 x i16> %and
 }
 
-; FIXME:
-; define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
-;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
-;   %and = and <3 x i16> %src0, %not.src1
-;   %cast = bitcast <3 x i16> %and to i48
-;   ret i48 %cast
-; }
-
-; define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
-;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
-;   %and = and <3 x i16> %not.src1, %src0
-;   %cast = bitcast <3 x i16> %and to i48
-;   ret i48 %cast
-; }
-
-; define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
-;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
-;   %and = and <3 x i16> %src0, %not.src1
-
-;   %cast.0 = bitcast <3 x i16> %and to i48
-;   %cast.1 = bitcast <3 x i16> %not.src1 to i48
-;   %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0
-;   %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
-;   ret { i48, i48 } %insert.1
-; }
-
-; define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
-;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -11>
-;   %and = and <3 x i16> %src0, %not.src1
-;   ret <3 x i16> %and
-; }
+
+define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
+; GFX6-LABEL: s_andn2_v3i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT:    s_mov_b32 s0, -1
+; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_mov_b32 s1, 0xffff
+; GFX6-NEXT:    s_or_b32 s6, s5, s6
+; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s4
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_andn2_v3i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b64 s[0:1], -1
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_andn2_v3i16:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_mov_b64 s[0:1], -1
+; GFX10PLUS-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX10PLUS-NEXT:    s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
+  %and = and <3 x i16> %src0, %not.src1
+  %cast = bitcast <3 x i16> %and to i48
+  ret i48 %cast
+}
+
+define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
+; GFX6-LABEL: s_andn2_v3i16_commute:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT:    s_mov_b32 s0, -1
+; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_mov_b32 s1, 0xffff
+; GFX6-NEXT:    s_or_b32 s6, s5, s6
+; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s5
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_or_b32 s2, s2, s3
+; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_andn2_v3i16_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b64 s[0:1], -1
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_andn2_v3i16_commute:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_mov_b64 s[0:1], -1
+; GFX10PLUS-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX10PLUS-NEXT:    s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
+  %and = and <3 x i16> %not.src1, %src0
+  %cast = bitcast <3 x i16> %and to i48
+  ret i48 %cast
+}
+
+define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
+; GFX6-LABEL: s_andn2_v3i16_multi_use:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT:    s_mov_b32 s0, -1
+; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX6-NEXT:    s_mov_b32 s1, 0xffff
+; GFX6-NEXT:    s_or_b32 s6, s5, s6
+; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_and_b32 s7, s4, 0xffff
+; GFX6-NEXT:    s_and_b32 s4, s0, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX6-NEXT:    s_or_b32 s6, s2, s3
+; GFX6-NEXT:    s_or_b32 s2, s4, s5
+; GFX6-NEXT:    s_and_b32 s3, s1, 0xffff
+; GFX6-NEXT:    s_and_b64 s[0:1], s[6:7], s[2:3]
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_or_b32 s2, s4, s5
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_andn2_v3i16_multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b64 s[0:1], -1
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s3, s6, 16
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT:    s_or_b32 s2, s2, s3
+; GFX9-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_andn2_v3i16_multi_use:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_mov_b64 s[0:1], -1
+; GFX10PLUS-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
+; GFX10PLUS-NEXT:    s_lshr_b32 s3, s4, 16
+; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX10PLUS-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX10PLUS-NEXT:    s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX10PLUS-NEXT:    s_or_b32 s2, s2, s3
+; GFX10PLUS-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX10PLUS-NEXT:    ; return to shader part epilog
+  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
+  %and = and <3 x i16> %src0, %not.src1
+  %cast.0 = bitcast <3 x i16> %and to i48
+  %cast.1 = bitcast <3 x i16> %not.src1 to i48
+  %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0
+  %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
+  ret { i48, i48 } %insert.1
+}
+
+define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
+; GFX6-LABEL: v_andn2_v3i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v3
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT:    v_xor_b32_e32 v4, 0xfff5, v4
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_and_b32_e32 v2, v1, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_andn2_v3i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT:    v_xor_b32_e32 v3, -11, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_andn2_v3i16:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10PLUS-NEXT:    v_xor_b32_e32 v2, -1, v2
+; GFX10PLUS-NEXT:    v_xor_b32_e32 v3, -11, v3
+; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX10PLUS-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+  %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -11>
+  %and = and <3 x i16> %src0, %not.src1
+  ret <3 x i16> %and
+}
 
 define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
 ; GFX6-LABEL: s_andn2_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -599,11 +599,55 @@
   ret <2 x i16> %bswap
 }
 
-; FIXME
-; define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
-;   %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %ext.src)
-;   ret <3 x i16> %bswap
-; }
+define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
+; GFX7-LABEL: v_bswap_v3i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v0
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v1
+; GFX7-NEXT:    v_bfe_u32 v1, v1, 8, 8
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v2
+; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_bswap_v3i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_mov_b32 s4, 0x2030001
+; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bswap_v3i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0x2030001
+; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_bswap_v3i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
+; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x2030001
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
+  ret <3 x i16> %bswap
+}
 
 define i64 @v_bswap_i48(i64 %src) {
 ; GFX7-LABEL: v_bswap_i48:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -490,34 +490,152 @@
   ret void
 }
 
-; FIXME:
-; define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
-;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-;   %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
-;   %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
-;   %cvt = uitofp <2 x i8> %load to <2 x float>
-;   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
-;   ret void
-; }
+define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v2i8_to_v2f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: load_v2i8_to_v2f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ushort v1, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
+  %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
+  %cvt = uitofp <2 x i8> %load to <2 x float>
+  store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
+  ret void
+}
 
-; FIXME:
-; define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
-;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-;   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
-;   %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
-;   %cvt = uitofp <3 x i8> %load to <3 x float>
-;   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
-;   ret void
-; }
+define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v3i8_to_v3f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
+; SI-NEXT:    v_bfe_u32 v3, v0, 16, 8
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:8
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: load_v3i8_to_v3f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v2, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
+; VI-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
+  %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
+  %cvt = uitofp <3 x i8> %load to <3 x float>
+  store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
+  ret void
+}
 
-; define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-;   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
-;   %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
-;   %cvt = uitofp <4 x i8> %load to <4 x float>
-;   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-;   ret void
-; }
+define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v4i8_to_v4f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
+; SI-NEXT:    v_bfe_u32 v4, v0, 16, 8
+; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: load_v4i8_to_v4f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v3, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
+  %cvt = uitofp <4 x i8> %load to <4 x float>
+  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
 
 ; This should not be adding instructions to shift into the correct
 ; position in the word for the component.
@@ -604,39 +722,265 @@
   ret void
 }
 
-; FIXME: Need to handle non-uniform case for function below (load without gep).
-; Instructions still emitted to repack bytes for add use.
-; define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
-;   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-;   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
-;   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
-;   %cvt = uitofp <4 x i8> %load to <4 x float>
-;   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-;   %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
-;   store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
-;   ret void
-; }
+define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v4i8_to_v4f32_2_uses:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
+; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 9, v0
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v2
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 9, v1
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 9, v2
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v7
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v8
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v9
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
+; SI-NEXT:    v_or_b32_e32 v0, v6, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: load_v4i8_to_v4f32_2_uses:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    v_mov_b32_e32 v6, 8
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v1, v[0:1]
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 9
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v1
+; VI-NEXT:    v_add_u16_e32 v9, 9, v1
+; VI-NEXT:    v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v7, 9, v7
+; VI-NEXT:    v_add_u16_e32 v8, 9, v8
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; VI-NEXT:    v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_and_b32_e32 v1, 0xff, v8
+; VI-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v10
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v2, v0, v2
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
+  %cvt = uitofp <4 x i8> %load to <4 x float>
+  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+  %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
+  store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
+  ret void
+}
 
-; Make sure this doesn't crash.
-; FIXME:
-; define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
-;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-;   %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
-;   %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
-;   %cvt = uitofp <7 x i8> %load to <7 x float>
-;   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
-;   ret void
-; }
+define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v7i8_to_v7f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1
+; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
+; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3
+; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:4
+; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5
+; SI-NEXT:    buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:6
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v3
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v5
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v6
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v7
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v8
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], 0 offset:24
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: load_v7i8_to_v7f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:
v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 5, v0 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 6, v0 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v1, v[2:3] +; VI-NEXT: flat_load_ubyte v2, v[4:5] +; VI-NEXT: flat_load_ubyte v3, v[6:7] +; VI-NEXT: flat_load_ubyte v4, v[8:9] +; VI-NEXT: flat_load_ubyte v5, v[10:11] +; VI-NEXT: flat_load_ubyte v6, v[12:13] +; VI-NEXT: v_mov_b32_e32 v8, s1 +; VI-NEXT: v_mov_b32_e32 v7, s0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v10, s1 +; VI-NEXT: v_mov_b32_e32 v9, s0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 +; VI-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; VI-NEXT: flat_store_dwordx3 v[9:10], v[4:6] +; VI-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid + %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1 + %cvt = uitofp <7 x i8> %load to <7 x float> + store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 + ret void +} -; FIXME -; define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { -; %tid = call i32 @llvm.amdgcn.workitem.id.x() -; %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid -; %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8 -; %cvt = uitofp <8 x i8> %load to <8 x float> -; store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 -; ret void -; } +define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { +; SI-LABEL: load_v8i8_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 +; SI-NEXT: v_bfe_u32 v4, v0, 8, 8 +; SI-NEXT: v_bfe_u32 v5, v0, 16, 8 +; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: v_bfe_u32 v8, v1, 8, 8 +; SI-NEXT: v_bfe_u32 v9, v1, 16, 8 +; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v1 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v5 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6 +; 
SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v8 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v9 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: s_endpgm +; +; VI-LABEL: load_v8i8_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: v_mov_b32_e32 v9, s1 +; VI-NEXT: v_mov_b32_e32 v8, s0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v11, s1 +; VI-NEXT: v_mov_b32_e32 v10, s0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v6 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v7 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; VI-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid + %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8 + %cvt = uitofp <8 x i8> %load to <8 x float> + store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 + ret void +} define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -346,11 +346,67 @@ ret <2 x half> %fma } -; FIXME: -; define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) { -; %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) -; ret <3 x half> %fma -; } +define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) { +; GFX6-LABEL: v_fma_v3f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_fma_f32 v0, v0, v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v8 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v4 +; GFX6-NEXT: v_fma_f32 v2, v2, v5, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v3f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX8-NEXT: v_fma_f16 
v0, v0, v2, v4 +; GFX8-NEXT: v_fma_f16 v2, v6, v7, v8 +; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v3f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fma_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) + ret <3 x half> %fma +} define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) { ; GFX6-LABEL: v_fma_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -118,30 +118,145 @@ ret <2 x half> %mul } -; FIXME -; define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) { -; %mul = fmul <3 x half> %a, %b -; ret <3 x half> %mul -; } +define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) { +; GFX9-LABEL: v_fmul_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v3f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v3f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %mul = fmul <3 x half> %a, %b + ret <3 x half> %mul +} -; define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) { -; %neg.a = fneg <3 x half> %a -; %mul = fmul <3 x half> %neg.a, %b -; ret <3 x half> %mul -; } +define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) { +; GFX9-LABEL: v_fmul_v3f16_fneg_lhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
v_fmul_v3f16_fneg_lhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000 +; GFX8-NEXT: v_xor_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v3f16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] + %neg.a = fneg <3 x half> %a + %mul = fmul <3 x half> %neg.a, %b + ret <3 x half> %mul +} -; define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) { -; %neg.b = fneg <3 x half> %b -; %mul = fmul <3 x half> %a, %neg.b -; ret <3 x half> %mul -; } +define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) { +; GFX9-LABEL: v_fmul_v3f16_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v3f16_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000 +; GFX8-NEXT: v_xor_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v3f16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] + %neg.b = fneg <3 x half> %b + %mul = fmul <3 x half> %a, %neg.b + ret <3 x half> %mul +} -; define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b) { -; %neg.a = fneg <3 x half> %a -; %neg.b = fneg <3 x half> %b -; %mul = fmul <3 x half> %neg.a, %neg.b -; ret <3 x 
half> %mul -; } +define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b) { +; GFX9-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %neg.a = fneg <3 x half> %a + %neg.b = fneg <3 x half> %b + %mul = fmul <3 x half> %neg.a, %neg.b + ret <3 x half> %mul +} define <4 x half> @v_fmul_v4f16(<4 x half> %a, <4 x half> %b) { ; GFX9-LABEL: v_fmul_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -4582,19 +4582,354 @@ ret float %cast } -; ; FIXME -; define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) { -; %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) -; %cast = bitcast <3 x i16> %result to i48 -; ret i48 %cast -; } -; ; FIXME -; define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) { -; %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) -; %cast.result = bitcast <3 x i16> %result to <3 x half> -; ret <3 x half> %cast.result -; } +define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) { +; GFX6-LABEL: s_fshl_v3i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s9, s6, 15 +; GFX6-NEXT: s_andn2_b32 s6, 15, s6 +; GFX6-NEXT: s_bfe_u32 s9, s9, 0x100000 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s9 +; GFX6-NEXT: s_lshr_b32 s3, s3, s6 +; GFX6-NEXT: s_or_b32 s0, s0, s3 +; GFX6-NEXT: s_and_b32 s3, s7, 15 +; GFX6-NEXT: s_andn2_b32 s6, 15, s7 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_lshl_b32 s1, s1, s3 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s4, s6, 0x100000 +; GFX6-NEXT: s_lshr_b32 s3, s3, s4 +; GFX6-NEXT: s_or_b32 s1, s1, s3 +; GFX6-NEXT: s_and_b32 s3, s8, 15 +; GFX6-NEXT: s_andn2_b32 s4, 15, s8 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_lshl_b32 s2, s2, s3 +; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_lshr_b32 s3, s3, s4 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: ; return to shader part epilog 
+; +; GFX8-LABEL: s_fshl_v3i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s9, s4, 15 +; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s7, s2, 16 +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s9 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_bfe_u32 s9, 1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s2, s9 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s2, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s8, 15 +; GFX8-NEXT: s_andn2_b32 s4, 15, s8 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshl_b32 s2, s6, s2 +; GFX8-NEXT: s_lshr_b32 s6, s7, s9 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s4, s6, s4 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s4, s5, 15 +; GFX8-NEXT: s_andn2_b32 s5, 15, s5 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_lshr_b32 s3, s3, s9 +; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, s4 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_or_b32 s1, s1, s3 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_lshl_b32 s6, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 0x10001 +; GFX9-NEXT: s_lshr_b32 s6, s6, 1 +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s7, s4, 16 +; GFX9-NEXT: s_lshr_b32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s5 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s2, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, 16 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s3, s3, 0x10001 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_lshr_b32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s3, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s9, s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX10-NEXT: s_lshr_b32 s9, s9, 0x10001 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_lshr_b32 s8, s6, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s9, s2 +; GFX10-NEXT: 
s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s6, s7, s8 +; GFX10-NEXT: s_lshr_b32 s7, s2, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s8, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s7, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_and_b32 s7, s3, 0xffff +; GFX10-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX10-NEXT: s_lshr_b32 s7, s7, 0x10001 +; GFX10-NEXT: s_lshr_b32 s3, s3, 1 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s5 +; GFX10-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s7, s3 +; GFX10-NEXT: s_lshl_b32 s3, s5, s6 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s6, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s5, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s9, s2, 0xffff +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX11-NEXT: s_lshr_b32 s9, s9, 0x10001 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s4 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s9, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-NEXT: s_lshl_b32 s6, s7, s8 +; GFX11-NEXT: s_lshr_b32 s7, s2, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, s4 +; GFX11-NEXT: s_lshr_b32 s4, s7, s8 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s7, s3, 0xffff +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX11-NEXT: s_lshr_b32 s7, s7, 0x10001 +; GFX11-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s5 +; GFX11-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s3 +; GFX11-NEXT: s_lshl_b32 s3, s5, s6 +; GFX11-NEXT: s_lshr_b32 s5, s2, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, s4 +; GFX11-NEXT: s_lshr_b32 s4, s5, s6 +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: ; return to shader part epilog + %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) + %cast = bitcast <3 x i16> %result to i48 + ret i48 %cast +} + +define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) { +; GFX6-LABEL: v_fshl_v3i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v9, 15, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 +; 
GFX6-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX6-NEXT: v_bfe_u32 v9, v9, 0, 16 +; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 +; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 15, v7 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 +; GFX6-NEXT: v_bfe_u32 v4, v6, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 15, v8 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v3i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 +; GFX8-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 1 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v6, 0xf000f, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v6, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v5 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: 
v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX11-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xf000f, v6 +; GFX11-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_and_b32_e32 v7, 0xf000f, v7 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX11-NEXT: v_pk_lshlrev_b16 v1, v5, v1 +; GFX11-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_lshrrev_b16 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) + %cast.result = bitcast <3 x i16> %result to <3 x half> + ret <3 x half> %cast.result +} define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) { ; GFX6-LABEL: s_fshl_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -4522,19 +4522,440 @@ ret float %cast } -; ; FIXME -; define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) { -; %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) -; %cast = bitcast <3 x i16> %result to i48 -; ret i48 %cast -; } +define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) { +; GFX6-LABEL: s_fshr_v3i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s7, s7, 0xffff +; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: s_or_b32 s6, s6, s7 +; GFX6-NEXT: s_and_b32 s7, s8, 0xffff +; GFX6-NEXT: s_bfe_u32 s8, 1, 0x100000 +; GFX6-NEXT: s_bfe_u32 s9, s3, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s10, 14, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s8 +; GFX6-NEXT: s_lshr_b32 s9, s9, s10 +; GFX6-NEXT: s_or_b32 s0, s0, s9 +; GFX6-NEXT: s_bfe_u32 s9, s4, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s1, s1, s8 +; GFX6-NEXT: s_lshr_b32 s9, s9, s10 +; GFX6-NEXT: s_xor_b32 s6, s6, -1 +; GFX6-NEXT: s_or_b32 s1, s1, s9 +; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: s_lshr_b32 s9, s6, 16 +; GFX6-NEXT: s_and_b32 s11, s6, 15 +; GFX6-NEXT: s_andn2_b32 s6, 15, s6 +; GFX6-NEXT: s_bfe_u32 s11, s11, 0x100000 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s11 +; GFX6-NEXT: s_lshr_b32 s3, s3, s6 +; GFX6-NEXT: s_or_b32 s0, s0, s3 +; GFX6-NEXT: s_and_b32 s3, s9, 15 +; GFX6-NEXT: s_lshl_b32 s4, s4, 1 +; GFX6-NEXT: s_andn2_b32 s6, 15, s9 +; GFX6-NEXT: 
s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_lshl_b32 s1, s1, s3 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s4, s6, 0x100000 +; GFX6-NEXT: s_lshr_b32 s3, s3, s4 +; GFX6-NEXT: s_or_b32 s1, s1, s3 +; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s2, s2, s8 +; GFX6-NEXT: s_lshr_b32 s3, s3, s10 +; GFX6-NEXT: s_xor_b32 s4, s7, -1 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s5, 1 +; GFX6-NEXT: s_and_b32 s5, s4, 15 +; GFX6-NEXT: s_andn2_b32 s4, 15, s4 +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_lshl_b32 s2, s2, s5 +; GFX6-NEXT: s_lshr_b32 s3, s3, s4 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_v3i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshr_b32 s8, s4, 16 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: s_lshl_b32 s8, s8, 16 +; GFX8-NEXT: s_or_b32 s4, s4, s8 +; GFX8-NEXT: s_bfe_u32 s8, 1, 0x100000 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x100000 +; GFX8-NEXT: s_bfe_u32 s10, 15, 0x100000 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s7, s2, 16 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_lshr_b32 s9, s9, s10 +; GFX8-NEXT: s_or_b32 s0, s0, s9 +; GFX8-NEXT: s_lshl_b32 s6, s6, s8 +; GFX8-NEXT: s_lshr_b32 s9, s7, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_xor_b32 s4, s4, -1 +; GFX8-NEXT: s_or_b32 s6, s6, s9 +; GFX8-NEXT: s_lshr_b32 s9, s4, 16 +; GFX8-NEXT: s_and_b32 s11, s4, 15 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_bfe_u32 s11, s11, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s2, s8 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s11 +; GFX8-NEXT: s_lshr_b32 s2, s2, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s9, 15 +; GFX8-NEXT: s_lshl_b32 s7, s7, s8 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_andn2_b32 s4, 15, s9 +; GFX8-NEXT: s_lshl_b32 s2, s6, s2 +; GFX8-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX8-NEXT: s_lshr_b32 s6, s6, s8 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s4, s6, s4 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_bfe_u32 s4, s3, 0x100000 +; GFX8-NEXT: s_and_b32 s5, s5, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_lshr_b32 s4, s4, s10 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_lshl_b32 s3, s3, s8 +; GFX8-NEXT: s_xor_b32 s4, s5, -1 +; GFX8-NEXT: s_and_b32 s5, s4, 15 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, s8 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshl_b32 s1, s1, s5 +; GFX8-NEXT: s_lshr_b32 s3, s3, s4 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_or_b32 s1, s1, s3 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX9-NEXT: s_lshl_b32 s7, s7, 1 +; GFX9-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; 
GFX9-NEXT: s_lshr_b32 s8, s4, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s4 +; GFX9-NEXT: s_lshl_b32 s4, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s7, s6, 16 +; GFX9-NEXT: s_lshr_b32 s2, s2, s6 +; GFX9-NEXT: s_lshr_b32 s4, s4, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s5 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x10001 +; GFX9-NEXT: s_lshl_b32 s5, s5, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, s4 +; GFX9-NEXT: s_lshl_b32 s4, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s3, s4, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX10-NEXT: s_lshl_b32 s6, s6, 1 +; GFX10-NEXT: s_and_b32 s7, s4, 0xf000f +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshr_b32 s8, s4, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_lshl_b32 s4, s6, s8 +; GFX10-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s8, s7, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, s7 +; GFX10-NEXT: s_lshr_b32 s6, s6, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-NEXT: s_and_b32 s4, s5, 0xf000f +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s1, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x10001 +; GFX10-NEXT: s_lshl_b32 s2, s2, 1 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_andn2_b32 s2, 0xf000f, s5 +; GFX10-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s2, s5, s6 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-NEXT: s_lshr_b32 s6, s4, 16 +; GFX10-NEXT: s_lshr_b32 s3, s3, s4 +; GFX10-NEXT: s_lshr_b32 s4, s5, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX11-NEXT: s_lshl_b32 s6, s6, 1 +; GFX11-NEXT: s_and_b32 s7, s4, 0xf000f +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s4 +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, s4 +; GFX11-NEXT: s_lshl_b32 s4, s6, s8 +; GFX11-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: 
s_lshr_b32 s8, s7, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, s7 +; GFX11-NEXT: s_lshr_b32 s6, s6, s8 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX11-NEXT: s_and_b32 s4, s5, 0xf000f +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s2, s1, 16 +; GFX11-NEXT: s_lshl_b32 s1, s1, 0x10001 +; GFX11-NEXT: s_lshl_b32 s2, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-NEXT: s_and_not1_b32 s2, 0xf000f, s5 +; GFX11-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s2, s5, s6 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s3, s4 +; GFX11-NEXT: s_lshr_b32 s4, s5, s6 +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s4 +; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: ; return to shader part epilog + %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) + %cast = bitcast <3 x i16> %result to i48 + ret i48 %cast +} -; ; FIXME -; define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) { -; %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) -; %cast.result = bitcast <3 x i16> %result to <3 x half> -; ret <3 x half> %cast.result -; } +define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) { +; GFX6-LABEL: v_fshr_v3i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 +; GFX6-NEXT: v_bfe_u32 v8, v3, 1, 15 +; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, s5, v8 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX6-NEXT: v_bfe_u32 v8, v4, 1, 15 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, s5, v8 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v9, 15, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX6-NEXT: v_bfe_u32 v9, v9, 0, 16 +; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 +; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 15, v8 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 +; GFX6-NEXT: v_bfe_u32 v4, v6, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, s5, v3 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 
v3, 1, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 15, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_v3i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v8, 1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v6 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, v9, v7 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v8 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v8 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, -1 +; GFX8-NEXT: v_xor_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v6, 0xf000f, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v4, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v2, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, v5, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v7, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX11-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xf000f, v6 +; GFX11-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_and_b32_e32 v7, 0xf000f, v7 +; GFX11-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX11-NEXT: v_pk_lshrrev_b16 v3, v5, v3 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_lshlrev_b16 v1, v7, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) + %cast.result = bitcast <3 x i16> %result to <3 x half> + ret <3 x half> %cast.result +} define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir @@ -501,21 +501,52 @@ $vgpr0 = COPY %5 ... -# FIXME -# --- -# name: test_add_s33 -# body: | -# bb.0: -# liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - -# %0:_(s64) = COPY $vgpr0_vgpr1 -# %1:_(s64) = COPY $vgpr2_vgpr3 -# %2:_(s33) = G_TRUNC %0 -# %3:_(s33) = G_TRUNC %1 -# %4:_(s33) = G_ADD %2, %3 -# %5:_(s64) = G_ANYEXT %4 -# $vgpr0_vgpr1 = COPY %5 -# ... 
+--- +name: test_add_s33 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX6-LABEL: name: test_add_s33 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX6-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] + ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] + ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) + ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; GFX8-LABEL: name: test_add_s33 + ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] + ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] + ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) + ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; GFX9-LABEL: name: test_add_s33 + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] + ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s33) = G_TRUNC %0 + %3:_(s33) = G_TRUNC %1 + %4:_(s33) = G_ADD %2, %3 + %5:_(s64) = G_ANYEXT %4 + $vgpr0_vgpr1 = COPY %5 +... --- name: test_add_s96 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir @@ -495,21 +495,52 @@ $vgpr0 = COPY %5 ... -# FIXME -# --- -# name: test_sub_s33 -# body: | -# bb.0: -# liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - -# %0:_(s64) = COPY $vgpr0_vgpr1 -# %1:_(s64) = COPY $vgpr2_vgpr3 -# %2:_(s33) = G_TRUNC %0 -# %3:_(s33) = G_TRUNC %1 -# %4:_(s33) = G_SUB %2, %3 -# %5:_(s64) = G_ANYEXT %4 -# $vgpr0_vgpr1 = COPY %5 -# ... 
+--- +name: test_sub_s33 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX6-LABEL: name: test_sub_s33 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX6-NEXT: {{ $}} + ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX6-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX6-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] + ; GFX6-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] + ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) + ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; GFX8-LABEL: name: test_sub_s33 + ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX8-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] + ; GFX8-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] + ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) + ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; GFX9-LABEL: name: test_sub_s33 + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX9-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] + ; GFX9-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s33) = G_TRUNC %0 + %3:_(s33) = G_TRUNC %1 + %4:_(s33) = G_SUB %2, %3 + %5:_(s64) = G_ANYEXT %4 + $vgpr0_vgpr1 = COPY %5 +... 
--- name: test_sub_s96 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -688,37 +688,242 @@ ret <2 x i16> %or } -; FIXME: -; define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { -; %not.src1 = xor <3 x i16> %src1, -; %or = or <3 x i16> %src0, %not.src1 -; %cast = bitcast <3 x i16> %or to i48 -; ret i48 %cast -; } - -; define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { -; %not.src1 = xor <3 x i16> %src1, -; %or = or <3 x i16> %not.src1, %src0 -; %cast = bitcast <3 x i16> %or to i48 -; ret i48 %cast -; } - -; define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { -; %not.src1 = xor <3 x i16> %src1, -; %or = or <3 x i16> %src0, %not.src1 - -; %cast.0 = bitcast <3 x i16> %or to i48 -; %cast.1 = bitcast <3 x i16> %not.src1 to i48 -; %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0 -; %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1 -; ret { i48, i48 } %insert.1 -; } - -; define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) { -; %not.src1 = xor <3 x i16> %src1, -; %or = or <3 x i16> %src0, %not.src1 -; ret <3 x i16> %or -; } +define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { +; GFX6-LABEL: s_orn2_v3i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: s_or_b32 s6, s5, s6 +; GFX6-NEXT: s_and_b32 s7, s7, 0xffff +; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_lshr_b32 s5, s0, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_and_b32 s3, s4, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s5, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_orn2_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[0:1], -1 +; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_orn2_v3i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1 +; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2 +; GFX10PLUS-NEXT: ; return to shader part epilog + %not.src1 = xor <3 x i16> %src1, + %or = or <3 x i16> %src0, %not.src1 + %cast = bitcast <3 x i16> %or to i48 + ret i48 %cast +} + +define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { +; GFX6-LABEL: s_orn2_v3i16_commute: 
+; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: s_or_b32 s6, s5, s6 +; GFX6-NEXT: s_and_b32 s7, s7, 0xffff +; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] +; GFX6-NEXT: s_lshr_b32 s5, s0, 16 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s5 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_and_b32 s3, s4, 0xffff +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_orn2_v3i16_commute: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[0:1], -1 +; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_orn2_v3i16_commute: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1 +; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2 +; GFX10PLUS-NEXT: ; return to shader part epilog + %not.src1 = xor <3 x i16> %src1, + %or = or <3 x i16> %not.src1, %src0 + %cast = bitcast <3 x i16> %or to i48 + ret i48 %cast +} + +define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { +; GFX6-LABEL: s_orn2_v3i16_multi_use: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s6, s6, 0xffff +; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: s_or_b32 s6, s5, s6 +; GFX6-NEXT: s_and_b32 s7, s7, 0xffff +; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] +; GFX6-NEXT: s_lshr_b32 s5, s0, 16 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s7, s4, 0xffff +; GFX6-NEXT: s_and_b32 s4, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s5, s5, 16 +; GFX6-NEXT: s_or_b32 s6, s2, s3 +; GFX6-NEXT: s_or_b32 s2, s4, s5 +; GFX6-NEXT: s_and_b32 s3, s1, 0xffff +; GFX6-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_or_b32 s2, s4, s5 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_orn2_v3i16_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[0:1], -1 +; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 
s3, s6, 16 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_orn2_v3i16_multi_use: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1 +; GFX10PLUS-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_and_b32 s2, s4, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_or_b32 s2, s2, s3 +; GFX10PLUS-NEXT: s_and_b32 s3, s5, 0xffff +; GFX10PLUS-NEXT: ; return to shader part epilog + %not.src1 = xor <3 x i16> %src1, + %or = or <3 x i16> %src0, %not.src1 + %cast.0 = bitcast <3 x i16> %or to i48 + %cast.1 = bitcast <3 x i16> %not.src1 to i48 + %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0 + %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1 + ret { i48, i48 } %insert.1 +} + +define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) { +; GFX6-LABEL: v_orn2_v3i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_xor_b32_e32 v4, 0xfff5, v4 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v1, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_orn2_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -11, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_orn2_v3i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -11, v3 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %not.src1 = xor <3 x i16> %src1, + %or = or <3 x i16> %src0, %not.src1 + ret <3 x i16> %or +} define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4155,30 +4155,410 @@ ret <4 x i32> %cast } -; FIXME: i48 broken because i48 add broken -; define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { -; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) -; ret i48 %result -; } +define i48 
@v_saddsat_i48(i48 %lhs, i48 %rhs) { +; GFX6-LABEL: v_saddsat_i48: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc +; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5 +; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_saddsat_i48: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc +; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16 +; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5 +; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_saddsat_i48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_saddsat_i48: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] +; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_saddsat_i48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; 
GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) + ret i48 %result +} -; define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { -; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) -; ret i48 %result -; } +define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { +; GFX6-LABEL: s_saddsat_i48: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_add_u32 s4, s0, s2 +; GFX6-NEXT: s_addc_u32 s5, s1, s3 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 +; GFX6-NEXT: s_ashr_i32 s3, s7, 31 +; GFX6-NEXT: s_ashr_i32 s2, s7, 15 +; GFX6-NEXT: s_add_u32 s3, s3, 0xffff8000 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_saddsat_i48: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s4, s0, s2 +; GFX8-NEXT: s_addc_u32 s5, s1, s3 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 +; GFX8-NEXT: s_ashr_i32 s3, s7, 31 +; GFX8-NEXT: s_ashr_i32 s2, s7, 15 +; GFX8-NEXT: s_add_u32 s3, s3, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_saddsat_i48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX9-NEXT: s_add_u32 s4, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_addc_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_saddsat_i48: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX10-NEXT: s_add_u32 s4, s0, s2 +; GFX10-NEXT: s_addc_u32 s5, s1, s3 +; 
GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_xor_b32 s2, s2, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 +; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_saddsat_i48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX11-NEXT: s_add_u32 s4, s0, s2 +; GFX11-NEXT: s_addc_u32 s5, s1, s3 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_ashr_i32 s0, s5, 31 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 +; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog + %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) + ret i48 %result +} -; define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { -; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) -; %ext.result = zext i48 %result to i64 -; %cast = bitcast i64 %ext.result to <2 x float> -; ret <2 x float> %cast -; } +define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { +; GFX6-LABEL: saddsat_i48_sv: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc +; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v3 +; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: saddsat_i48_sv: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc +; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v3 +; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: saddsat_i48_sv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: 
v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: saddsat_i48_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: saddsat_i48_sv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: ; return to shader part epilog + %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) + %ext.result = zext i48 %result to i64 + %cast = bitcast i64 %ext.result to <2 x float> + ret <2 x float> %cast +} -; define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { -; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) -; %ext.result = zext i48 %result to i64 -; %cast = bitcast i64 %ext.result to <2 x float> -; ret <2 x float> %cast -; } +define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { +; GFX6-LABEL: saddsat_i48_vs: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc +; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v3 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: saddsat_i48_vs: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc +; GFX8-NEXT: 
v_bfe_i32 v3, v2, 0, 16 +; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v3 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: saddsat_i48_vs: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: saddsat_i48_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: saddsat_i48_vs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 +; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: ; return to shader part epilog + %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) + %ext.result = zext i48 %result to i64 + %cast = bitcast i64 %ext.result to <2 x float> + ret <2 x float> %cast +} define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-LABEL: v_saddsat_i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4141,30 +4141,410 @@ ret <4 x i32> %cast } -; FIXME: i48 broken because i48 add broken -; define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { -; %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) -; ret i48 %result -; } +define 
i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { +; GFX6-LABEL: v_ssubsat_i48: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc +; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5 +; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_i48: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc +; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16 +; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5 +; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_i48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_i48: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] +; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_ssubsat_i48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; 
GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) + ret i48 %result +} -; define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { -; %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) -; ret i48 %result -; } +define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { +; GFX6-LABEL: s_ssubsat_i48: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_sub_u32 s4, s0, s2 +; GFX6-NEXT: s_subb_u32 s5, s1, s3 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX6-NEXT: s_ashr_i32 s3, s7, 31 +; GFX6-NEXT: s_ashr_i32 s2, s7, 15 +; GFX6-NEXT: s_add_u32 s3, s3, 0xffff8000 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_ssubsat_i48: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_sub_u32 s4, s0, s2 +; GFX8-NEXT: s_subb_u32 s5, s1, s3 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX8-NEXT: s_ashr_i32 s3, s7, 31 +; GFX8-NEXT: s_ashr_i32 s2, s7, 15 +; GFX8-NEXT: s_add_u32 s3, s3, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_ssubsat_i48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX9-NEXT: s_sub_u32 s4, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_subb_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ssubsat_i48: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX10-NEXT: s_sub_u32 s4, s0, s2 +; GFX10-NEXT: s_subb_u32 s5, s1, s3 +; 
GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_xor_b32 s2, s2, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 +; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_ssubsat_i48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX11-NEXT: s_sub_u32 s4, s0, s2 +; GFX11-NEXT: s_subb_u32 s5, s1, s3 +; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_ashr_i32 s0, s5, 31 +; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 +; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog + %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) + ret i48 %result +} -; define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { -; %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) -; %ext.result = zext i48 %result to i64 -; %cast = bitcast i64 %ext.result to <2 x float> -; ret <2 x float> %cast -; } +define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { +; GFX6-LABEL: ssubsat_i48_sv: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0 +; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc +; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v3 +; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: ssubsat_i48_sv: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc +; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v3 +; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: ssubsat_i48_sv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: 
v_sub_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ssubsat_i48_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ssubsat_i48_sv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: ; return to shader part epilog + %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) + %ext.result = zext i48 %result to i64 + %cast = bitcast i64 %ext.result to <2 x float> + ret <2 x float> %cast +} -; define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { -; %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) -; %ext.result = zext i48 %result to i64 -; %cast = bitcast i64 %ext.result to <2 x float> -; ret <2 x float> %cast -; } +define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { +; GFX6-LABEL: ssubsat_i48_vs: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 +; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc +; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v3 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: ssubsat_i48_vs: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc +; 
GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v3 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: ssubsat_i48_vs: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ssubsat_i48_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ssubsat_i48_vs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 +; GFX11-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 +; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: ; return to shader part epilog + %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) + %ext.result = zext i48 %result to i64 + %cast = bitcast i64 %ext.result to <2 x float> + ret <2 x float> %cast +} define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-LABEL: v_ssubsat_i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2636,30 +2636,250 @@ ret <4 x i32> %cast } -; FIXME: i48 broken because i48 add broken -; define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) { -; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) -; ret i48 
%result -; } +define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) { +; GFX6-LABEL: v_uaddsat_i48: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -1, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_i48: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_i48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_uaddsat_i48: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) + ret i48 %result +} -; define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { -; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) -; ret i48 %result -; } +define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { +; GFX6-LABEL: s_uaddsat_i48: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: s_add_u32 s0, s0, s2 +; GFX6-NEXT: s_mov_b32 s5, 0xffff +; GFX6-NEXT: s_addc_u32 s1, s1, s3 +; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_and_b64 s[6:7], s[0:1], s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_uaddsat_i48: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], 
v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_uaddsat_i48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_uaddsat_i48: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2 +; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 +; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s2, s[0:1], s[2:3] +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, s2 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, s2 +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10PLUS-NEXT: ; return to shader part epilog + %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) + ret i48 %result +} -; define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { -; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) -; %ext.result = zext i48 %result to i64 -; %cast = bitcast i64 %ext.result to <2 x float> -; ret <2 x float> %cast -; } +define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { +; GFX6-LABEL: uaddsat_i48_sv: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: uaddsat_i48_sv: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: uaddsat_i48_sv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: ; return to shader part epilog +; 
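+; A sketch of the legalization these checks encode: the i48 operands are +; widened into the high bits of an i64 by a 16-bit left shift +; (s_lshl_b64 / v_lshlrev_b64), the 64-bit add-with-carry and unsigned +; overflow compare (v_cmp_lt_u64) run on the widened values, v_cndmask +; clamps to all-ones on overflow, and v_lshrrev_b64 by 16 recovers the i48 +; result (the GFX6 variant instead masks the high half with 0xffff).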
+; GFX10PLUS-LABEL: uaddsat_i48_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10PLUS-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 +; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog + %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) + %ext.result = zext i48 %result to i64 + %cast = bitcast i64 %ext.result to <2 x float> + ret <2 x float> %cast +} -; define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { -; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) -; %ext.result = zext i48 %result to i64 -; %cast = bitcast i64 %ext.result to <2 x float> -; ret <2 x float> %cast -; } +define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { +; GFX6-LABEL: uaddsat_i48_vs: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v1, v2, vcc +; GFX6-NEXT: s_mov_b32 s3, 0xffff +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: uaddsat_i48_vs: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: uaddsat_i48_vs: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: uaddsat_i48_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0 +; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog + %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) + %ext.result = zext i48 %result to i64 + %cast = bitcast i64 %ext.result to <2 x float> + ret <2 x float> %cast +} define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-LABEL: v_uaddsat_i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -2506,30 +2506,252 @@ ret <4 x i32> %cast } -; FIXME: i48 broken because i48 add broken -; define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) { -; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) -; ret i48 %result -; } +define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) { +; GFX6-LABEL: v_usubsat_i48: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_i48: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_i48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_usubsat_i48: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX10PLUS-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 +; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc_lo +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) + ret i48 %result +} -; define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { -; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) -; ret i48 %result -; } +define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { +; GFX6-LABEL: s_usubsat_i48: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_sub_u32 s4, s0, s2 +; GFX6-NEXT: s_mov_b32 s7, 0xffff +; GFX6-NEXT: s_subb_u32 s5, s1, s3 +; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; 
GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_usubsat_i48: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_sub_u32 s4, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_subb_u32 s5, s1, s3 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_usubsat_i48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_sub_u32 s4, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_subb_u32 s5, s1, s3 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_usubsat_i48: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX10PLUS-NEXT: s_sub_u32 s4, s0, s2 +; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s3 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s4, 0, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, 0, s0 +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10PLUS-NEXT: ; return to shader part epilog + %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) + ret i48 %result +} -; define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { -; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) -; %ext.result = zext i48 %result to i64 -; %cast = bitcast i64 %ext.result to <2 x float> -; ret <2 x float> %cast -; } +define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { +; GFX6-LABEL: usubsat_i48_sv: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xffff +; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: usubsat_i48_sv: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: usubsat_i48_sv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, 
v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v1, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: usubsat_i48_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 +; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog + %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) + %ext.result = zext i48 %result to i64 + %cast = bitcast i64 %ext.result to <2 x float> + ret <2 x float> %cast +} -; define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { -; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) -; %ext.result = zext i48 %result to i64 -; %cast = bitcast i64 %ext.result to <2 x float> -; ret <2 x float> %cast -; } +define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { +; GFX6-LABEL: usubsat_i48_vs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xffff +; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: usubsat_i48_vs: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: usubsat_i48_vs: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: usubsat_i48_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 +; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo +; GFX10PLUS-NEXT: 
v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog + %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) + %ext.result = zext i48 %result to i64 + %cast = bitcast i64 %ext.result to <2 x float> + ret <2 x float> %cast +} define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-LABEL: v_usubsat_i64: