diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) { ; GFX9-LABEL: v_add_v2i16: @@ -16,6 +17,13 @@ ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %a, %b ret <2 x i16> %add } @@ -35,6 +43,13 @@ ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_v2i16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16> %add = add <2 x i16> %cast.neg.a, %b @@ -56,6 +71,13 @@ ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_v2i16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg <2 x half> %b %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16> %add = add <2 x i16> %a, %cast.neg.b @@ -79,6 +101,13 @@ ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %neg.b = fneg <2 x half> %b %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16> @@ -104,6 +133,13 @@ ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_v2i16_neg_inline_imm_splat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_u16 v0, 0xffc0, v0 op_sel_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %a, ret <2 x i16> %add } @@ -124,6 +160,13 @@ ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_v2i16_neg_inline_imm_lo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_u16 v0, 0x4ffc0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %a, ret <2 x i16> %add } @@ -144,6 +187,13 @@ ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_add_v2i16_neg_inline_imm_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_u16 v0, 0xffc00004, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i16> %a, ret <2 x i16> %add } @@ -169,6 +219,14 @@ ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_add_v2i16_neg_inline_imm_splat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s1, s0, 16 +; GFX10-NEXT: s_add_i32 s0, s0, 0xffc0ffc0 +; GFX10-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %add = add <2 x i16> %a, %cast = bitcast <2 x i16> %add to i32 ret i32 %cast @@ -194,6 +252,14 @@ ; GFX8-NEXT: s_and_b32 s0, s0, s2 ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_add_v2i16_neg_inline_imm_lo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s1, s0, 16 +; GFX10-NEXT: s_add_i32 s0, s0, 0x4ffc0 +; GFX10-NEXT: s_add_i32 s1, s1, 4 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %add = add <2 x i16> %a, %cast = bitcast <2 x i16> %add to i32 ret i32 %cast @@ -219,6 +285,14 @@ ; GFX8-NEXT: s_and_b32 s0, s0, s2 ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_add_v2i16_neg_inline_imm_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s1, s0, 16 +; GFX10-NEXT: s_add_i32 s0, s0, 0xffc00004 +; GFX10-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %add = add <2 x i16> %a, %cast = bitcast <2 x i16> %add to i32 ret i32 %cast @@ -247,6 +321,15 @@ ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_add_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: ; return to shader part epilog %add = add <2 x i16> %a, %b %cast = bitcast <2 x i16> %add to i32 ret i32 %cast @@ -277,6 +360,16 @@ ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_add_v2i16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_xor_b32 s0, s0, 0x80008000 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: ; return to shader part epilog %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16> %add = add <2 x i16> %cast.neg.a, %b @@ -309,6 +402,16 @@ ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_add_v2i16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_xor_b32 s1, s1, 0x80008000 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: ; return to shader part epilog %neg.b = fneg <2 x half> %b %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16> %add = add <2 x i16> %a, %cast.neg.b @@ -345,6 +448,18 @@ ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0x80008000 +; GFX10-NEXT: s_xor_b32 s1, s1, s2 +; GFX10-NEXT: s_xor_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: ; return to shader part epilog %neg.a = fneg <2 x half> %a %neg.b = fneg <2 x half> %b %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -1,12 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) { ; GCN-LABEL: s_andn2_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_andn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 ret i32 %and @@ -17,6 +23,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_andn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i32_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %not.src1, %src0 ret i32 %and @@ -28,6 +39,12 @@ ; GCN-NEXT: s_not_b32 s1, s3 ; GCN-NEXT: s_andn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i32_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s3 +; GFX10-NEXT: s_not_b32 s1, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 %insert.0 = insertvalue { i32, i32 } undef, i32 %and, 0 @@ -41,6 +58,12 @@ ; GCN-NEXT: s_andn2_b32 s0, s2, s4 ; GCN-NEXT: s_andn2_b32 s1, s3, s4 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i32_multi_foldable_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s4 +; GFX10-NEXT: s_andn2_b32 s1, s3, s4 +; GFX10-NEXT: ; return to shader part epilog %not.src2 = xor i32 %src2, -1 %and0 = and i32 %src0, %not.src2 %and1 = and i32 %src1, %not.src2 @@ -56,6 +79,14 @@ ; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_andn2_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 ret i32 %and @@ -67,6 +98,12 @@ ; GCN-NEXT: v_xor_b32_e32 v0, -1, v0 ; GCN-NEXT: v_and_b32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_andn2_i32_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 %cast = bitcast i32 %and to float @@ -79,6 +116,12 @@ ; GCN-NEXT: s_not_b32 s0, s2 ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_andn2_i32_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_not_b32 s0, s2 +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 %cast = bitcast i32 %and to float @@ -90,6 +133,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 ret i64 %and @@ -100,6 +148,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i64_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %not.src1, %src0 ret i64 %and @@ -111,6 +164,12 @@ ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[6:7] ; GCN-NEXT: s_andn2_b64 s[2:3], s[4:5], s[6:7] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i64_multi_foldable_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[6:7] +; GFX10-NEXT: s_andn2_b64 s[2:3], s[4:5], s[6:7] +; GFX10-NEXT: ; return to shader part epilog %not.src2 = xor i64 %src2, -1 %and0 = and i64 %src0, %not.src2 %and1 = and i64 %src1, %not.src2 @@ -127,6 +186,12 @@ ; GCN-NEXT: s_mov_b32 s2, s6 ; GCN-NEXT: s_mov_b32 s3, s7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i64_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: s_not_b64 s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 %insert.0 = insertvalue { i64, i64 } undef, i64 %and, 0 @@ -143,6 +208,16 @@ ; GCN-NEXT: v_and_b32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_andn2_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 ret i64 %and @@ -156,6 +231,14 @@ ; GCN-NEXT: v_and_b32_e32 v0, s2, v0 ; GCN-NEXT: v_and_b32_e32 v1, s3, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_andn2_i64_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 %cast = bitcast i64 %and to <2 x float> @@ -169,6 +252,13 @@ ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 ; GCN-NEXT: v_and_b32_e32 v1, s1, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_andn2_i64_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_not_b64 s[0:1], s[2:3] +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s1, v1 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 %cast = bitcast i64 %and to <2 x float> @@ -180,6 +270,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i32> %src1, %and = and <2 x i32> %src0, %not.src1 ret <2 x i32> %and @@ -190,6 +285,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_v2i32_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i32> %src1, %and = and <2 x i32> %not.src1, %src0 ret <2 x i32> %and @@ -200,6 +300,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_andn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 ret i16 %and @@ -210,6 +315,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_andn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %not.src1, %src0 ret i16 %and @@ -221,6 +331,12 @@ ; GCN-NEXT: s_xor_b32 s1, s3, -1 ; GCN-NEXT: s_andn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i16_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s3 +; GFX10-NEXT: s_xor_b32 s1, s3, -1 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 %insert.0 = insertvalue { i16, i16 } undef, i16 %and, 0 @@ -234,6 +350,12 @@ ; GCN-NEXT: s_andn2_b32 s0, s2, s4 ; GCN-NEXT: s_andn2_b32 s1, s3, s4 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_i16_multi_foldable_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s4 +; GFX10-NEXT: s_andn2_b32 s1, s3, s4 +; GFX10-NEXT: ; return to shader part epilog %not.src2 = xor i16 %src2, -1 %and0 = and i16 %src0, %not.src2 %and1 = and i16 %src1, %not.src2 @@ -249,6 +371,14 @@ ; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_andn2_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 ret i16 %and @@ -261,6 +391,13 @@ ; GCN-NEXT: v_and_b32_e32 v0, s2, v0 ; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_andn2_i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 %zext = zext i16 %and to i32 @@ -275,6 +412,13 @@ ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 ; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_andn2_i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_xor_b32 s0, s2, -1 +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 %zext = zext i16 %and to i32 @@ -293,6 +437,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_andn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %src0, %not.src1 %cast = bitcast <2 x i16> %and to i32 @@ -310,6 +459,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_andn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_v2i16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %not.src1, %src0 %cast = bitcast <2 x i16> %and to i32 @@ -328,6 +482,12 @@ ; GFX9-NEXT: s_xor_b32 s1, s3, -1 ; GFX9-NEXT: s_andn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_v2i16_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s3 +; GFX10-NEXT: s_xor_b32 s1, s3, -1 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %src0, %not.src1 @@ -351,6 +511,12 @@ ; GFX9-NEXT: s_andn2_b32 s0, s2, s4 ; GFX9-NEXT: s_andn2_b32 s1, s3, s4 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_v2i16_multi_foldable_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s2, s4 +; GFX10-NEXT: s_andn2_b32 s1, s3, s4 +; GFX10-NEXT: ; return to shader part epilog %not.src2 = xor <2 x i16> %src2, %and0 = and <2 x i16> %src0, %not.src2 %and1 = and <2 x i16> %src1, %not.src2 @@ -369,6 +535,14 @@ ; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_andn2_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %src0, %not.src1 ret <2 x i16> %and @@ -435,6 +609,14 @@ ; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, -1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %and = and <4 x i16> %src0, %not.src1 %cast = bitcast <4 x i16> %and to i64 @@ -470,6 +652,14 @@ ; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_v4i16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, -1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %and = and <4 x i16> %not.src1, %src0 %cast = bitcast <4 x i16> %and to i64 @@ -507,6 +697,16 @@ ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_v4i16_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, -1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %and = and <4 x i16> %src0, %not.src1 @@ -554,6 +754,15 @@ ; GFX9-NEXT: s_and_b64 s[0:1], s[2:3], s[6:7] ; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_andn2_v4i16_multi_foldable_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, -1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] +; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[6:7] +; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] +; GFX10-NEXT: ; return to shader part epilog %not.src2 = xor <4 x i16> %src2, %and0 = and <4 x i16> %src0, %not.src2 %and1 = and <4 x i16> %src1, %not.src2 @@ -598,6 +807,16 @@ ; GFX9-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_andn2_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor <4 x i16> %src1, %and = and <4 x i16> %src0, %not.src1 ret <4 x i16> %and diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s define i8 @v_ashr_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_ashr_i8: @@ -24,6 +25,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX10-NEXT: v_ashrrev_i16_e64 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i8 %value, %amount ret i8 %result } @@ -49,6 +59,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 7 ; GFX9-NEXT: v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i8_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX10-NEXT: v_ashrrev_i16_e64 v0, 7, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i8 %value, 7 ret i8 %result } @@ -74,6 +92,13 @@ ; GFX9-NEXT: s_sext_i32_i8 s1, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sext_i32_i8 s0, s0 +; GFX10-NEXT: s_sext_i32_i8 s1, s1 +; GFX10-NEXT: s_ashr_i32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i8 %value, %amount ret i8 %result } @@ -84,6 +109,12 @@ ; GCN-NEXT: s_sext_i32_i8 s0, s0 ; GCN-NEXT: s_ashr_i32 s0, s0, 7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i8_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sext_i32_i8 s0, s0 +; GFX10-NEXT: s_ashr_i32 s0, s0, 7 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i8 %value, 7 ret i8 %result } @@ -97,6 +128,15 @@ ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i24 %value, %amount ret i24 %result } @@ -108,6 +148,14 @@ ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 7, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i24_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 7, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i24 %value, 7 ret i24 %result } @@ -119,6 +167,13 @@ ; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000 ; GCN-NEXT: s_ashr_i32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s1, s1, 0xffffff +; GFX10-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GFX10-NEXT: s_ashr_i32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i24 %value, %amount ret i24 %result } @@ -129,6 +184,12 @@ ; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000 ; GCN-NEXT: s_ashr_i32 s0, s0, 7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i24_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GFX10-NEXT: s_ashr_i32 s0, s0, 7 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i24 %value, 7 ret i24 %result } @@ -139,6 +200,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i32 %value, %amount ret i32 %result } @@ -149,6 +217,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i32_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i32 %value, 31 ret i32 %result } @@ -158,6 +233,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_ashr_i32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i32 %value, %amount ret i32 %result } @@ -167,6 +247,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_ashr_i32 s0, s0, 31 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i32_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i32 s0, s0, 31 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i32 %value, 31 ret i32 %result } @@ -186,6 +271,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_ashrrev_i32_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ashr_i32_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_ashrrev_i32_e64 v0, v0, s0 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -196,6 +286,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: v_ashrrev_i32_e32 v0, s0, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ashr_i32_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_ashrrev_i32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -208,6 +303,14 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, v2, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, v3, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, v2, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i32> %value, %amount ret <2 x i32> %result } @@ -219,6 +322,14 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v2i32_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i32> %value, ret <2 x i32> %result } @@ -229,6 +340,12 @@ ; GCN-NEXT: s_ashr_i32 s0, s0, s2 ; GCN-NEXT: s_ashr_i32 s1, s1, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i32 s0, s0, s2 +; GFX10-NEXT: s_ashr_i32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <2 x i32> %value, %amount ret <2 x i32> %result } @@ -241,6 +358,15 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v1, v4, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v2, v5, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, v3, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, v4, v1 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, v5, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <3 x i32> %value, %amount ret <3 x i32> %result } @@ -252,6 +378,13 @@ ; GCN-NEXT: s_ashr_i32 s1, s1, s4 ; GCN-NEXT: s_ashr_i32 s2, s2, s5 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i32 s0, s0, s3 +; GFX10-NEXT: s_ashr_i32 s1, s1, s4 +; GFX10-NEXT: s_ashr_i32 s2, s2, s5 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <3 x i32> %value, %amount ret <3 x i32> %result } @@ -265,6 +398,16 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v2, v6, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v3, v7, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, v4, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, v5, v1 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, v6, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, v7, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <4 x i32> %value, %amount ret <4 x i32> %result } @@ -277,6 +420,14 @@ ; GCN-NEXT: s_ashr_i32 s2, s2, s6 ; GCN-NEXT: s_ashr_i32 s3, s3, s7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i32 s0, s0, s4 +; GFX10-NEXT: s_ashr_i32 s1, s1, s5 +; GFX10-NEXT: s_ashr_i32 s2, s2, s6 +; GFX10-NEXT: s_ashr_i32 s3, s3, s7 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <4 x i32> %value, %amount ret <4 x i32> %result } @@ -291,6 +442,17 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v3, v8, v3 ; GCN-NEXT: v_ashrrev_i32_e32 v4, v9, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v5i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, v5, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, v6, v1 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, v7, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, v8, v3 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, v9, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <5 x i32> %value, %amount ret <5 x i32> %result } @@ -304,6 +466,15 @@ ; GCN-NEXT: s_ashr_i32 s3, s3, s8 ; GCN-NEXT: s_ashr_i32 s4, s4, s9 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_v5i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i32 s0, s0, s5 +; GFX10-NEXT: s_ashr_i32 s1, s1, s6 +; GFX10-NEXT: s_ashr_i32 s2, s2, s7 +; GFX10-NEXT: s_ashr_i32 s3, s3, s8 +; GFX10-NEXT: s_ashr_i32 s4, s4, s9 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <5 x i32> %value, %amount ret <5 x i32> %result } @@ -329,6 +500,28 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v14, v30, v14 ; GCN-NEXT: v_ashrrev_i32_e32 v15, v31, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, v16, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, v17, v1 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, v18, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, v19, v3 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, v20, v4 +; GFX10-NEXT: v_ashrrev_i32_e32 v5, v21, v5 +; GFX10-NEXT: v_ashrrev_i32_e32 v6, v22, v6 +; GFX10-NEXT: v_ashrrev_i32_e32 v7, v23, v7 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, v24, v8 +; GFX10-NEXT: v_ashrrev_i32_e32 v9, v25, v9 +; GFX10-NEXT: v_ashrrev_i32_e32 v10, v26, v10 +; GFX10-NEXT: v_ashrrev_i32_e32 v11, v27, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, v28, v12 +; GFX10-NEXT: v_ashrrev_i32_e32 v13, v29, v13 +; GFX10-NEXT: v_ashrrev_i32_e32 v14, v30, v14 +; GFX10-NEXT: v_ashrrev_i32_e32 v15, v31, v15 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <16 x i32> %value, %amount ret <16 x i32> %result } @@ -353,6 +546,26 @@ ; GCN-NEXT: s_ashr_i32 s14, s14, s30 ; GCN-NEXT: s_ashr_i32 s15, s15, s31 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i32 s0, s0, s16 +; GFX10-NEXT: s_ashr_i32 s1, s1, s17 +; GFX10-NEXT: s_ashr_i32 s2, s2, s18 +; GFX10-NEXT: s_ashr_i32 s3, s3, s19 +; GFX10-NEXT: s_ashr_i32 s4, s4, s20 +; GFX10-NEXT: s_ashr_i32 s5, s5, s21 +; GFX10-NEXT: s_ashr_i32 s6, s6, s22 +; GFX10-NEXT: s_ashr_i32 s7, s7, s23 +; GFX10-NEXT: s_ashr_i32 s8, s8, s24 +; GFX10-NEXT: s_ashr_i32 s9, s9, s25 +; GFX10-NEXT: s_ashr_i32 s10, s10, s26 +; GFX10-NEXT: s_ashr_i32 s11, s11, s27 +; GFX10-NEXT: s_ashr_i32 s12, s12, s28 +; GFX10-NEXT: s_ashr_i32 s13, s13, s29 +; GFX10-NEXT: s_ashr_i32 s14, s14, s30 +; GFX10-NEXT: s_ashr_i32 s15, s15, s31 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <16 x i32> %value, %amount ret <16 x i32> %result } @@ -377,6 +590,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_ashrrev_i16_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i16_e64 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i16 %value, %amount ret i16 %result } @@ -386,6 +606,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i16_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i16 %value, 31 ret i16 %result } @@ -411,6 +637,13 @@ ; GFX9-NEXT: s_sext_i32_i16 s1, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sext_i32_i16 s0, s0 +; GFX10-NEXT: s_sext_i32_i16 s1, s1 +; GFX10-NEXT: s_ashr_i32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i16 %value, %amount ret i16 %result } @@ -421,6 +654,12 @@ ; GCN-NEXT: s_sext_i32_i16 s0, s0 ; GCN-NEXT: s_ashr_i32 s0, s0, 15 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i16_15: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sext_i32_i16 s0, s0 +; GFX10-NEXT: s_ashr_i32 s0, s0, 15 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i16 %value, 15 ret i16 %result } @@ -442,6 +681,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_ashrrev_i16_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ashr_i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_ashrrev_i16_e64 v0, v0, s0 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -464,6 +708,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_ashrrev_i16_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ashr_i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_ashrrev_i16_e64 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -500,6 +749,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_ashrrev_i16 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i16> %value, %amount ret <2 x i16> %result } @@ -534,6 +790,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v2i16_15: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i16> %value, ret <2 x i16> %result } @@ -580,6 +843,17 @@ ; GFX9-NEXT: s_ashr_i32 s0, s0, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sext_i32_i16 s2, s0 +; GFX10-NEXT: s_sext_i32_i16 s3, s1 +; GFX10-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10-NEXT: s_ashr_i32 s2, s2, s3 +; GFX10-NEXT: s_ashr_i32 s0, s0, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to i32 ret i32 %cast @@ -615,6 +889,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_pk_ashrrev_i16 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ashr_v2i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_pk_ashrrev_i16 v0, v0, s0 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -650,6 +929,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_pk_ashrrev_i16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ashr_v2i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_pk_ashrrev_i16 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -710,6 +994,14 @@ ; GFX9-NEXT: v_pk_ashrrev_i16 v0, v2, v0 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_ashrrev_i16 v0, v2, v0 +; GFX10-NEXT: v_pk_ashrrev_i16 v1, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -785,6 +1077,24 @@ ; GFX9-NEXT: s_ashr_i32 s1, s1, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-NEXT: s_sext_i32_i16 s5, s2 +; GFX10-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10-NEXT: s_ashr_i32 s2, s2, 16 +; GFX10-NEXT: s_ashr_i32 s4, s4, s5 +; GFX10-NEXT: s_ashr_i32 s0, s0, s2 +; GFX10-NEXT: s_sext_i32_i16 s2, s1 +; GFX10-NEXT: s_sext_i32_i16 s5, s3 +; GFX10-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10-NEXT: s_ashr_i32 s3, s3, 16 +; GFX10-NEXT: s_ashr_i32 s2, s2, s5 +; GFX10-NEXT: s_ashr_i32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x i32> ret <2 x i32> %cast @@ -886,6 +1196,16 @@ ; GFX9-NEXT: v_pk_ashrrev_i16 v2, v6, v2 ; GFX9-NEXT: v_pk_ashrrev_i16 v3, v7, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v8i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_ashrrev_i16 v0, v4, v0 +; GFX10-NEXT: v_pk_ashrrev_i16 v1, v5, v1 +; GFX10-NEXT: v_pk_ashrrev_i16 v2, v6, v2 +; GFX10-NEXT: v_pk_ashrrev_i16 v3, v7, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x float> ret <4 x float> %cast @@ -1017,6 +1337,38 @@ ; GFX9-NEXT: s_ashr_i32 s3, s3, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_v8i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sext_i32_i16 s8, s0 +; GFX10-NEXT: s_sext_i32_i16 s9, s4 +; GFX10-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10-NEXT: s_ashr_i32 s8, s8, s9 +; GFX10-NEXT: s_ashr_i32 s0, s0, s4 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: s_sext_i32_i16 s9, s5 +; GFX10-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10-NEXT: s_ashr_i32 s5, s5, 16 +; GFX10-NEXT: s_ashr_i32 s4, s4, s9 +; GFX10-NEXT: s_ashr_i32 s1, s1, s5 +; GFX10-NEXT: s_sext_i32_i16 s5, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s4, s1 +; GFX10-NEXT: s_sext_i32_i16 s4, s2 +; GFX10-NEXT: s_ashr_i32 s2, s2, 16 +; GFX10-NEXT: s_ashr_i32 s6, s6, 16 +; GFX10-NEXT: s_ashr_i32 s4, s4, s5 +; GFX10-NEXT: s_ashr_i32 s2, s2, s6 +; GFX10-NEXT: s_sext_i32_i16 s5, s3 +; GFX10-NEXT: s_sext_i32_i16 s6, s7 +; GFX10-NEXT: s_ashr_i32 s3, s3, 16 +; GFX10-NEXT: s_ashr_i32 s7, s7, 16 +; GFX10-NEXT: s_ashr_i32 s5, s5, s6 +; GFX10-NEXT: s_ashr_i32 s3, s3, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s8, s0 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s5, s3 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x i32> ret <4 x i32> %cast @@ -1040,6 +1392,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i64 %value, %amount ret i64 %result } @@ -1051,6 +1410,14 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v1 ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i64_63: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i64 %value, 63 ret i64 %result } @@ -1063,6 +1430,14 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, 1, v1 ; GCN-NEXT: v_mov_b32_e32 v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i64_33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 1, v1 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i64 %value, 33 ret i64 %result } @@ -1074,6 +1449,14 @@ ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i64_32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i64 %value, 32 ret i64 %result } @@ -1096,6 +1479,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_i64_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr i64 %value, 31 ret i64 %result } @@ -1105,6 +1495,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i64 %value, %amount ret i64 %result } @@ -1115,6 +1510,12 @@ ; GCN-NEXT: s_ashr_i32 s0, s1, 31 ; GCN-NEXT: s_mov_b32 s1, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i64_63: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i32 s0, s1, 31 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i64 %value, 63 ret i64 %result } @@ -1126,6 +1527,12 @@ ; GCN-NEXT: s_ashr_i32 s0, s1, 1 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i64_33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i32 s0, s1, 1 +; GFX10-NEXT: s_ashr_i32 s1, s1, 31 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i64 %value, 33 ret i64 %result } @@ -1136,6 +1543,12 @@ ; GCN-NEXT: s_mov_b32 s0, s1 ; GCN-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i64_32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s1 +; GFX10-NEXT: s_ashr_i32 s1, s1, 31 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i64 %value, 32 ret i64 %result } @@ -1145,6 +1558,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 31 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_i64_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i64 s[0:1], s[0:1], 31 +; GFX10-NEXT: ; return to shader part epilog %result = ashr i64 %value, 31 ret i64 %result } @@ -1164,6 +1582,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_ashrrev_i64 v[0:1], v0, s[0:1] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ashr_i64_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_ashrrev_i64 v[0:1], v0, s[0:1] +; GFX10-NEXT: ; return to shader part epilog %result = ashr i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1184,6 +1607,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_ashrrev_i64 v[0:1], s0, v[0:1] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ashr_i64_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[0:1] +; GFX10-NEXT: ; return to shader part epilog %result = ashr i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1210,6 +1638,18 @@ ; GFX9-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] ; GFX9-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_ashrrev_i64 v[0:1], v4, v[10:11] +; GFX10-NEXT: v_ashrrev_i64 v[2:3], v6, v[7:8] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i64> %value, %amount ret <2 x i64> %result } @@ -1235,6 +1675,14 @@ ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_ashrrev_i64 v[2:3], 31, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ashr_v2i64_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] +; GFX10-NEXT: v_ashrrev_i64 v[2:3], 31, v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i64> %value, ret <2 x i64> %result } @@ -1245,6 +1693,12 @@ ; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], s4 ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s6 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ashr_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_ashr_i64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_ashr_i64 s[2:3], s[2:3], s6 +; GFX10-NEXT: ; return to shader part epilog %result = ashr <2 x i64> %value, %amount ret <2 x i64> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -o - %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) { ; GFX7-LABEL: s_bswap_i32: @@ -28,6 +29,12 @@ ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_bswap_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x10203 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %bswap = call i32 @llvm.bswap.i32(i32 %src) ret i32 %bswap } @@ -55,6 +62,13 @@ ; GFX9-NEXT: s_mov_b32 s4, 0x10203 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_bswap_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x10203 +; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call i32 @llvm.bswap.i32(i32 %src) ret i32 %bswap } @@ -94,6 +108,15 @@ ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_bswap_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0x10203 +; GFX10-NEXT: v_perm_b32 v0, 0, s0, s2 +; GFX10-NEXT: v_perm_b32 v1, 0, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) ret <2 x i32> %bswap } @@ -126,6 +149,15 @@ ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_bswap_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s4, 0x10203 +; GFX10-NEXT: v_perm_b32 v0, 0, v0, s4 +; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) ret <2 x i32> %bswap } @@ -165,6 +197,15 @@ ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_bswap_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0x10203 +; GFX10-NEXT: v_perm_b32 v0, 0, s1, s2 +; GFX10-NEXT: v_perm_b32 v1, 0, s0, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog %bswap = call i64 @llvm.bswap.i64(i64 %src) ret i64 %bswap } @@ -200,6 +241,16 @@ ; GFX9-NEXT: v_perm_b32 v1, 0, v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_bswap_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s4, 0x10203 +; GFX10-NEXT: v_perm_b32 v2, 0, v1, s4 +; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call i64 @llvm.bswap.i64(i64 %src) ret i64 %bswap } @@ -259,6 +310,19 @@ ; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_bswap_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s4, 0x10203 +; GFX10-NEXT: v_perm_b32 v0, 0, s1, s4 +; GFX10-NEXT: v_perm_b32 v1, 0, s0, s4 +; GFX10-NEXT: v_perm_b32 v2, 0, s3, s4 +; GFX10-NEXT: v_perm_b32 v3, 0, s2, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: ; return to shader part epilog %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src) ret <2 x i64> %bswap } @@ -307,6 +371,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_bswap_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s4, 0x10203 +; GFX10-NEXT: v_perm_b32 v4, 0, v1, s4 +; GFX10-NEXT: v_perm_b32 v5, 0, v3, s4 +; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX10-NEXT: v_perm_b32 v3, 0, v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src) ret <2 x i64> %bswap } @@ -335,6 +412,12 @@ ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_bswap_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0xc0c0001 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %bswap = call i16 @llvm.bswap.i16(i16 %src) ret i16 %bswap } @@ -362,6 +445,13 @@ ; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_bswap_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001 +; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call i16 @llvm.bswap.i16(i16 %src) ret i16 %bswap } @@ -398,6 +488,12 @@ ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_bswap_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x2030001 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src) %cast = bitcast <2 x i16> %bswap to i32 ret i32 %cast @@ -427,6 +523,13 @@ ; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_bswap_i16_zext_to_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001 +; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call i16 @llvm.bswap.i16(i16 %src) %zext = zext i16 %bswap to i32 ret i32 %zext @@ -458,6 +561,14 @@ ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_bswap_i16_sext_to_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001 +; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call i16 @llvm.bswap.i16(i16 %src) %zext = sext i16 %bswap to i32 ret i32 %zext @@ -494,6 +605,13 @@ ; GFX9-NEXT: s_mov_b32 s4, 0x2030001 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_bswap_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x2030001 +; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src) ret <2 x i16> %bswap } @@ -535,6 +653,16 @@ ; GFX9-NEXT: v_perm_b32 v2, 0, v0, s4 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_bswap_i48: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s4, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX10-NEXT: v_perm_b32 v2, 0, v0, s4 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2] +; GFX10-NEXT: s_setpc_b64 s[30:31] %trunc = trunc i64 %src to i48 %bswap = call i48 @llvm.bswap.i48(i48 %trunc) %zext = zext i48 %bswap to i64 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 inreg %idx) { ; GCN-LABEL: extractelement_sgpr_v4i128_sgpr_idx: @@ -12,6 +13,15 @@ ; GCN-NEXT: s_movrels_b64 s[0:1], s[8:9] ; GCN-NEXT: s_movrels_b64 s[2:3], s[10:11] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i128_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GFX10-NEXT: s_lshl_b32 m0, s4, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_movrels_b64 s[0:1], s[8:9] +; GFX10-NEXT: s_movrels_b64 s[2:3], s[10:11] +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element @@ -85,6 +95,26 @@ ; GFX7-NEXT: v_readfirstlane_b32 s2, v2 ; GFX7-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_vgpr_v4i128_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 +; GFX10-NEXT: s_lshl_b32 s0, s2, 1 +; GFX10-NEXT: s_lshl_b32 m0, s0, 1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_movrels_b32_e32 v1, v3 +; GFX10-NEXT: v_movrels_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element @@ -270,6 +300,65 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v19, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v19 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v23, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v3, v5, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v19 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cndmask_b32_e32 v27, v18, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v22, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v23, v8, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v10, vcc_lo +; GFX10-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v27, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v3, v9, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v10, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v19 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v22, v11, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v12, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v3, v13, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v14, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v19 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v16, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v17, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v18, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element @@ -485,6 +574,62 @@ ; GFX7-NEXT: v_readfirstlane_b32 s2, v2 ; GFX7-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: v_cndmask_b32_e32 v4, s8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, s9, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, s8, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s19, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s20, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s21, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s20, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s21, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s22, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s22, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s23, s0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element @@ -496,6 +641,12 @@ ; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i128_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 0 ret i128 %element @@ -511,6 +662,16 @@ ; GCN-NEXT: s_mov_b32 s2, s6 ; GCN-NEXT: s_mov_b32 s3, s7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i128_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s2, s6 +; GFX10-NEXT: s_mov_b32 s3, s7 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 1 ret i128 %element @@ -526,6 +687,16 @@ ; GCN-NEXT: s_mov_b32 s2, s10 ; GCN-NEXT: s_mov_b32 s3, s11 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i128_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s8 +; GFX10-NEXT: s_mov_b32 s1, s9 +; GFX10-NEXT: s_mov_b32 s2, s10 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 2 ret i128 %element @@ -541,6 +712,16 @@ ; GCN-NEXT: s_mov_b32 s2, s14 ; GCN-NEXT: s_mov_b32 s3, s15 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i128_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s12 +; GFX10-NEXT: s_mov_b32 s1, s13 +; GFX10-NEXT: s_mov_b32 s2, s14 +; GFX10-NEXT: s_mov_b32 s3, s15 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 3 ret i128 %element @@ -570,6 +751,14 @@ ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i128_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 0 ret i128 %element @@ -613,6 +802,18 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i128_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-NEXT: v_mov_b32_e32 v3, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 1 ret i128 %element @@ -656,6 +857,18 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i128_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v8 +; GFX10-NEXT: v_mov_b32_e32 v1, v9 +; GFX10-NEXT: v_mov_b32_e32 v2, v10 +; GFX10-NEXT: v_mov_b32_e32 v3, v11 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 2 ret i128 %element @@ -699,6 +912,18 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, v14 ; GFX7-NEXT: v_mov_b32_e32 v3, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i128_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v12 +; GFX10-NEXT: v_mov_b32_e32 v1, v13 +; GFX10-NEXT: v_mov_b32_e32 v2, v14 +; GFX10-NEXT: v_mov_b32_e32 v3, v15 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 3 ret i128 %element diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(<4 x i16> addrspace(4)* inreg %ptr, i32 inreg %idx) { ; GCN-LABEL: extractelement_sgpr_v4i16_sgpr_idx: @@ -15,6 +16,18 @@ ; GCN-NEXT: s_lshl_b32 s1, s1, 4 ; GCN-NEXT: s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i16_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_lshr_b32 s2, s4, 1 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cselect_b32 s0, s1, s0 +; GFX10-NEXT: s_and_b32 s1, s4, 1 +; GFX10-NEXT: s_lshl_b32 s1, s1, 4 +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx ret i16 %element @@ -59,6 +72,19 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_vgpr_v4i16_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_lshr_b32 s0, s2, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: s_and_b32 s0, s2, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx ret i16 %element @@ -103,6 +129,20 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i16_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx ret i16 %element @@ -123,6 +163,20 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i16_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx ret i16 %element @@ -134,6 +188,12 @@ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i16_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 0 ret i16 %element @@ -146,6 +206,13 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i16_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 1 ret i16 %element @@ -158,6 +225,13 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i16_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s1 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 2 ret i16 %element @@ -170,6 +244,13 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s1, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i16_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 3 ret i16 %element @@ -196,6 +277,14 @@ ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i16_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 0 ret i16 %element @@ -225,6 +314,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i16_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 1 ret i16 %element @@ -254,6 +352,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i16_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 2 ret i16 %element @@ -283,6 +390,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i16_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 3 ret i16 %element @@ -304,6 +420,22 @@ ; GCN-NEXT: s_lshl_b32 s1, s1, 4 ; GCN-NEXT: s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i16_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_lshr_b32 s5, s4, 1 +; GFX10-NEXT: s_cmp_eq_u32 s5, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cselect_b32 s0, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 2 +; GFX10-NEXT: s_cselect_b32 s0, s2, s0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 3 +; GFX10-NEXT: s_cselect_b32 s0, s3, s0 +; GFX10-NEXT: s_and_b32 s1, s4, 1 +; GFX10-NEXT: s_lshl_b32 s1, s1, 4 +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx ret i16 %element @@ -363,6 +495,23 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_vgpr_v8i16_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_lshr_b32 s0, s2, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 +; GFX10-NEXT: s_and_b32 s0, s2, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx ret i16 %element @@ -422,6 +571,24 @@ ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i16_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx ret i16 %element @@ -448,6 +615,24 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i16_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx ret i16 %element @@ -459,6 +644,12 @@ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i16_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 0 ret i16 %element @@ -471,6 +662,13 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i16_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 1 ret i16 %element @@ -483,6 +681,13 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i16_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s1 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 2 ret i16 %element @@ -495,6 +700,13 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s1, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i16_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 3 ret i16 %element @@ -507,6 +719,13 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i16_idx4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 4 ret i16 %element @@ -519,6 +738,13 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s2, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i16_idx5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s0, s2, 16 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 5 ret i16 %element @@ -531,6 +757,13 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i16_idx6: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s3 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 6 ret i16 %element @@ -543,6 +776,13 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s3, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i16_idx7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s0, s3, 16 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 7 ret i16 %element @@ -572,6 +812,14 @@ ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i16_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 0 ret i16 %element @@ -604,6 +852,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i16_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 1 ret i16 %element @@ -636,6 +893,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i16_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 2 ret i16 %element @@ -668,6 +934,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i16_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 3 ret i16 %element @@ -700,6 +975,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i16_idx4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 4 ret i16 %element @@ -732,6 +1016,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i16_idx5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 5 ret i16 %element @@ -764,6 +1057,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i16_idx6: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 6 ret i16 %element @@ -796,6 +1098,15 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i16_idx7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 7 ret i16 %element diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(<4 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) { ; GCN-LABEL: extractelement_sgpr_v4i8_sgpr_idx: @@ -25,6 +26,28 @@ ; GCN-NEXT: s_lshl_b32 s1, s1, 3 ; GCN-NEXT: s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i8_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s2, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s4, 3 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 3 +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 %idx ret i8 %element @@ -96,6 +119,25 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_vgpr_v4i8_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: s_and_b32 s0, s2, 3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 %idx ret i8 %element @@ -167,6 +209,26 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i8_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_and_b32_sdwa v4, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v6, v0, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 +; GFX10-NEXT: v_or3_b32 v0, v6, v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 %idx ret i8 %element @@ -241,6 +303,29 @@ ; GFX7-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i8_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s4, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 %idx ret i8 %element @@ -265,6 +350,25 @@ ; GCN-NEXT: s_lshl_b32 s1, s4, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i8_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s4, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 0 ret i8 %element @@ -290,6 +394,26 @@ ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 8 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i8_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s4, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 1 ret i8 %element @@ -315,6 +439,26 @@ ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i8_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s4, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 2 ret i8 %element @@ -340,6 +484,26 @@ ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 24 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v4i8_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s4, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 3 ret i8 %element @@ -402,6 +566,23 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i8_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 0 ret i8 %element @@ -467,6 +648,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i8_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 1 ret i8 %element @@ -532,6 +731,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i8_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 2 ret i8 %element @@ -597,6 +814,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v4i8_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 3 ret i8 %element @@ -639,6 +874,43 @@ ; GCN-NEXT: s_lshl_b32 s1, s1, 3 ; GCN-NEXT: s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i8_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: s_lshr_b32 s3, s4, 2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s5, s0, 8 +; GFX10-NEXT: s_lshr_b32 s8, s1, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_and_b32 s5, s5, s2 +; GFX10-NEXT: s_and_b32 s8, s8, s2 +; GFX10-NEXT: s_lshr_b32 s9, s1, 16 +; GFX10-NEXT: s_lshr_b32 s7, s0, 24 +; GFX10-NEXT: s_and_b32 s6, s6, s2 +; GFX10-NEXT: s_lshr_b32 s10, s1, 24 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s2, s9, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s8 +; GFX10-NEXT: s_lshl_b32 s7, s7, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s9, s10, 24 +; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s7 +; GFX10-NEXT: s_or_b32 s1, s1, s9 +; GFX10-NEXT: s_cmp_eq_u32 s3, 1 +; GFX10-NEXT: s_cselect_b32 s0, s1, s0 +; GFX10-NEXT: s_and_b32 s1, s4, 3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 3 +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 %idx ret i8 %element @@ -747,6 +1019,35 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_vgpr_v8i8_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v11, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v2, v1, s1, v3 +; GFX10-NEXT: s_lshr_b32 s0, s2, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: s_and_b32 s0, s2, 3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 %idx ret i8 %element @@ -855,6 +1156,36 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i8_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v7, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v8, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v3 +; GFX10-NEXT: v_and_or_b32 v1, v1, s5, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX10-NEXT: v_or3_b32 v1, v1, v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 %idx ret i8 %element @@ -900,6 +1231,45 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i8_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_and_b32 s6, s6, s2 +; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s3, s3, s2 +; GFX10-NEXT: s_lshr_b32 s8, s1, 24 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s4, s4, s2 +; GFX10-NEXT: s_and_b32 s2, s7, s2 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_lshl_b32 s7, s8, 24 +; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_lshl_b32 s2, s4, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 %idx ret i8 %element @@ -924,6 +1294,25 @@ ; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i8_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s4, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 0 ret i8 %element @@ -949,6 +1338,26 @@ ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 8 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i8_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s4, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 1 ret i8 %element @@ -974,6 +1383,26 @@ ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i8_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s4, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 2 ret i8 %element @@ -999,6 +1428,26 @@ ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 24 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i8_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s4, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 3 ret i8 %element @@ -1023,6 +1472,25 @@ ; GCN-NEXT: s_lshl_b32 s1, s3, 24 ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i8_idx4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s1, 8 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s0 +; GFX10-NEXT: s_and_b32 s4, s1, s0 +; GFX10-NEXT: s_and_b32 s0, s3, s0 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 4 ret i8 %element @@ -1048,6 +1516,26 @@ ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 8 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i8_idx5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s1, 8 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s0 +; GFX10-NEXT: s_and_b32 s4, s1, s0 +; GFX10-NEXT: s_and_b32 s0, s3, s0 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 5 ret i8 %element @@ -1073,6 +1561,26 @@ ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i8_idx6: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s1, 8 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s0 +; GFX10-NEXT: s_and_b32 s4, s1, s0 +; GFX10-NEXT: s_and_b32 s0, s3, s0 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 6 ret i8 %element @@ -1098,6 +1606,26 @@ ; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_lshr_b32 s0, s0, 24 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v8i8_idx7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s1, 8 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s0 +; GFX10-NEXT: s_and_b32 s4, s1, s0 +; GFX10-NEXT: s_and_b32 s0, s3, s0 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 7 ret i8 %element @@ -1160,6 +1688,23 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i8_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 0 ret i8 %element @@ -1225,6 +1770,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i8_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 1 ret i8 %element @@ -1290,6 +1853,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i8_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 2 ret i8 %element @@ -1355,6 +1936,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i8_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 3 ret i8 %element @@ -1417,6 +2016,23 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i8_idx4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 4 ret i8 %element @@ -1482,6 +2098,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i8_idx5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 5 ret i8 %element @@ -1547,6 +2181,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i8_idx6: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 6 ret i8 %element @@ -1612,6 +2264,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v8i8_idx7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 7 ret i8 %element @@ -1682,6 +2352,71 @@ ; GCN-NEXT: s_lshl_b32 s1, s1, 3 ; GCN-NEXT: s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v16i8_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s6, s6, s5 +; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: s_and_b32 s7, s7, s5 +; GFX10-NEXT: s_and_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_lshr_b32 s12, s2, 8 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_lshr_b32 s13, s2, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s7 +; GFX10-NEXT: s_and_b32 s7, s12, s5 +; GFX10-NEXT: s_lshl_b32 s8, s8, 24 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; GFX10-NEXT: s_lshr_b32 s14, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s8 +; GFX10-NEXT: s_and_b32 s2, s2, s5 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_and_b32 s8, s13, s5 +; GFX10-NEXT: s_lshr_b32 s10, s1, 16 +; GFX10-NEXT: s_and_b32 s9, s9, s5 +; GFX10-NEXT: s_or_b32 s2, s2, s7 +; GFX10-NEXT: s_lshl_b32 s7, s8, 16 +; GFX10-NEXT: s_lshr_b32 s15, s3, 8 +; GFX10-NEXT: s_lshr_b32 s11, s1, 24 +; GFX10-NEXT: s_and_b32 s10, s10, s5 +; GFX10-NEXT: s_and_b32 s1, s1, s5 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_or_b32 s2, s2, s7 +; GFX10-NEXT: s_and_b32 s7, s15, s5 +; GFX10-NEXT: s_lshr_b32 s16, s3, 16 +; GFX10-NEXT: s_lshl_b32 s10, s10, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s9 +; GFX10-NEXT: s_lshr_b32 s17, s3, 24 +; GFX10-NEXT: s_and_b32 s3, s3, s5 +; GFX10-NEXT: s_and_b32 s5, s16, s5 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s10 +; GFX10-NEXT: s_lshl_b32 s6, s11, 24 +; GFX10-NEXT: s_or_b32 s3, s3, s7 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_lshl_b32 s6, s14, 24 +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_lshl_b32 s5, s17, 24 +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_lshr_b32 s6, s4, 2 +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_cselect_b32 s0, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: s_cselect_b32 s0, s2, s0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_cselect_b32 s0, s3, s0 +; GFX10-NEXT: s_and_b32 s1, s4, 3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 3 +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %vector = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %element = extractelement <16 x i8> %vector, i32 %idx ret i8 %element @@ -1860,6 +2595,55 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_vgpr_v16i8_sgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_mov_b32_e32 v4, 0xff +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX10-NEXT: v_and_b32_sdwa v13, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_b32_sdwa v14, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_and_or_b32 v23, v1, s1, v8 +; GFX10-NEXT: s_lshr_b32 s0, s2, 2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v17, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX10-NEXT: v_and_or_b32 v2, v2, s1, v19 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v13, v7 +; GFX10-NEXT: v_or3_b32 v1, v23, v14, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: v_and_or_b32 v5, v3, v4, v5 +; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_or3_b32 v2, v2, v17, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 +; GFX10-NEXT: v_or3_b32 v1, v5, v3, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 +; GFX10-NEXT: s_and_b32 s0, s2, 3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 %idx ret i8 %element @@ -2038,6 +2822,56 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0xff +; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v16, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v17, v4, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GFX10-NEXT: v_and_or_b32 v3, v3, s5, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 8, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX10-NEXT: v_and_or_b32 v4, v4, s5, v10 +; GFX10-NEXT: v_and_b32_sdwa v18, v5, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX10-NEXT: v_or3_b32 v3, v3, v16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v13 +; GFX10-NEXT: v_or3_b32 v4, v4, v17, v11 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_or_b32 v5, v5, s5, v12 +; GFX10-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX10-NEXT: v_and_or_b32 v0, v6, v0, v1 +; GFX10-NEXT: v_or3_b32 v1, v5, v18, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 +; GFX10-NEXT: v_or3_b32 v0, v0, v19, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 %idx ret i8 %element @@ -2113,6 +2947,73 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: extractelement_sgpr_v16i8_vgpr_idx: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s8, s1, 8 +; GFX10-NEXT: s_lshr_b32 s5, s0, 8 +; GFX10-NEXT: s_lshr_b32 s9, s1, 16 +; GFX10-NEXT: s_and_b32 s8, s8, s4 +; GFX10-NEXT: s_and_b32 s5, s5, s4 +; GFX10-NEXT: s_and_b32 s9, s9, s4 +; GFX10-NEXT: s_lshr_b32 s10, s1, 24 +; GFX10-NEXT: s_and_b32 s1, s1, s4 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshr_b32 s7, s0, 24 +; GFX10-NEXT: s_and_b32 s0, s0, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_lshl_b32 s9, s9, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s8 +; GFX10-NEXT: s_and_b32 s6, s6, s4 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: s_or_b32 s1, s1, s9 +; GFX10-NEXT: s_lshl_b32 s5, s10, 24 +; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: s_lshr_b32 s11, s2, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s5 +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s6, s11, s4 +; GFX10-NEXT: s_lshl_b32 s7, s7, 24 +; GFX10-NEXT: s_lshr_b32 s12, s2, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_lshr_b32 s13, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s7 +; GFX10-NEXT: s_and_b32 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_and_b32 s7, s12, s4 +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_lshl_b32 s6, s7, 16 +; GFX10-NEXT: s_lshr_b32 s14, s3, 8 +; GFX10-NEXT: s_lshr_b32 s15, s3, 16 +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_and_b32 s6, s14, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX10-NEXT: s_lshl_b32 s5, s13, 24 +; GFX10-NEXT: s_and_b32 s1, s15, s4 +; GFX10-NEXT: s_lshr_b32 s16, s3, 24 +; GFX10-NEXT: s_and_b32 s3, s3, s4 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_or_b32 s3, s3, s6 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX10-NEXT: s_or_b32 s0, s3, s1 +; GFX10-NEXT: s_lshl_b32 s1, s16, 24 +; GFX10-NEXT: s_or_b32 s3, s0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog %vector = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %element = extractelement <16 x i8> %vector, i32 %idx ret i8 %element @@ -2175,6 +3076,23 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 0 ret i8 %element @@ -2240,6 +3158,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 1 ret i8 %element @@ -2305,6 +3241,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 2 ret i8 %element @@ -2370,6 +3324,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 3 ret i8 %element @@ -2432,6 +3404,23 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 4 ret i8 %element @@ -2497,6 +3486,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 5 ret i8 %element @@ -2562,6 +3569,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx6: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 6 ret i8 %element @@ -2627,6 +3652,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_sdwa v3, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 7 ret i8 %element @@ -2689,6 +3732,23 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v2, s4, v0 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 8 ret i8 %element @@ -2754,6 +3814,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx9: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v2, s4, v0 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 9 ret i8 %element @@ -2819,6 +3897,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx10: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v2, s4, v0 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 10 ret i8 %element @@ -2884,6 +3980,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx11: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v2, s4, v0 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 11 ret i8 %element @@ -2946,6 +4060,23 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx12: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v2, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v3, s4, v0 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 12 ret i8 %element @@ -3011,6 +4142,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx13: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v2, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v3, s4, v0 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 13 ret i8 %element @@ -3076,6 +4225,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx14: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v2, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v3, s4, v0 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 14 ret i8 %element @@ -3141,6 +4308,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: extractelement_vgpr_v16i8_idx15: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v2, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v3, s4, v0 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 15 ret i8 %element diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define float @dyn_extract_v8f32_const_s_v(i32 %sel) { ; GCN-LABEL: dyn_extract_v8f32_const_s_v: @@ -26,6 +27,26 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v8f32_const_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, 0x41000000, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x float> , i32 %sel ret float %ext @@ -50,6 +71,25 @@ ; GCN-NEXT: s_cselect_b32 s0, 0x41000000, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f32_const_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: s_cselect_b32 s0, 2.0, 1.0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 2 +; GFX10-NEXT: s_cselect_b32 s0, 0x40400000, s0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 3 +; GFX10-NEXT: s_cselect_b32 s0, 4.0, s0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 4 +; GFX10-NEXT: s_cselect_b32 s0, 0x40a00000, s0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 5 +; GFX10-NEXT: s_cselect_b32 s0, 0x40c00000, s0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 6 +; GFX10-NEXT: s_cselect_b32 s0, 0x40e00000, s0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 7 +; GFX10-NEXT: s_cselect_b32 s0, 0x41000000, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <8 x float> , i32 %sel ret float %ext @@ -85,6 +125,33 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v8, vcc ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f32_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s7, vcc_lo +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <8 x float> %vec, i32 %sel ret float %ext @@ -109,6 +176,26 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v8 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v8f32_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x float> %vec, i32 %sel ret float %ext @@ -132,6 +219,24 @@ ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f32_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <8 x float> %vec, i32 %sel ret float %ext @@ -156,6 +261,25 @@ ; GCN-NEXT: s_cselect_b32 s0, s9, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f32_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_cmp_eq_u32 s10, 1 +; GFX10-NEXT: s_cselect_b32 s0, s3, s2 +; GFX10-NEXT: s_cmp_eq_u32 s10, 2 +; GFX10-NEXT: s_cselect_b32 s0, s4, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 3 +; GFX10-NEXT: s_cselect_b32 s0, s5, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 4 +; GFX10-NEXT: s_cselect_b32 s0, s6, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 5 +; GFX10-NEXT: s_cselect_b32 s0, s7, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 6 +; GFX10-NEXT: s_cselect_b32 s0, s8, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 7 +; GFX10-NEXT: s_cselect_b32 s0, s9, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <8 x float> %vec, i32 %sel ret float %ext @@ -211,6 +335,43 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v8i64_const_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b64 s[6:7], 2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: s_mov_b64 s[4:5], 1 +; GFX10-NEXT: s_mov_b64 s[8:9], 3 +; GFX10-NEXT: s_mov_b64 s[14:15], 4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, s5, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: s_mov_b64 s[12:13], 5 +; GFX10-NEXT: s_mov_b64 s[16:17], 7 +; GFX10-NEXT: s_mov_b64 s[18:19], 8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: s_mov_b64 s[14:15], 6 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s16, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s17, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s19, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x i64> , i32 %sel ret i64 %ext @@ -250,6 +411,23 @@ ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; MOVREL-NEXT: s_endpgm +; +; GFX10-LABEL: dyn_extract_v8i64_const_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b64 s[4:5], 1 +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: s_mov_b64 s[18:19], 8 +; GFX10-NEXT: s_mov_b64 s[16:17], 7 +; GFX10-NEXT: s_mov_b64 s[14:15], 6 +; GFX10-NEXT: s_mov_b64 s[12:13], 5 +; GFX10-NEXT: s_mov_b64 s[10:11], 4 +; GFX10-NEXT: s_mov_b64 s[8:9], 3 +; GFX10-NEXT: s_mov_b64 s[6:7], 2 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> , i32 %sel store i64 %ext, i64 addrspace(1)* undef @@ -364,6 +542,50 @@ ; MOVREL-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc ; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; MOVREL-NEXT: s_endpgm +; +; GFX10-LABEL: dyn_extract_v8i64_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s19, s5 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s19 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s46, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: s_mov_b32 s47, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s47, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> %vec, i32 %sel store i64 %ext, i64 addrspace(1)* undef @@ -396,6 +618,33 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v8i64_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x i64> %vec, i32 %sel ret i64 %ext @@ -419,6 +668,14 @@ ; MOVREL-NEXT: v_movrels_b32_e32 v17, v1 ; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[16:17] ; MOVREL-NEXT: s_endpgm +; +; GFX10-LABEL: dyn_extract_v8i64_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_lshl_b32 m0, s2, 1 +; GFX10-NEXT: v_movrels_b32_e32 v16, v0 +; GFX10-NEXT: v_movrels_b32_e32 v17, v1 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[16:17], off +; GFX10-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> %vec, i32 %sel store i64 %ext, i64 addrspace(1)* undef @@ -475,6 +732,31 @@ ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; MOVREL-NEXT: s_endpgm +; +; GFX10-LABEL: dyn_extract_v8i64_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> %vec, i32 %sel store i64 %ext, i64 addrspace(1)* undef @@ -501,6 +783,26 @@ ; GCN-NEXT: s_cselect_b32 s0, s9, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f32_s_s_offset3: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_add_i32 s10, s10, 3 +; GFX10-NEXT: s_cmp_eq_u32 s10, 1 +; GFX10-NEXT: s_cselect_b32 s0, s3, s2 +; GFX10-NEXT: s_cmp_eq_u32 s10, 2 +; GFX10-NEXT: s_cselect_b32 s0, s4, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 3 +; GFX10-NEXT: s_cselect_b32 s0, s5, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 4 +; GFX10-NEXT: s_cselect_b32 s0, s6, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 5 +; GFX10-NEXT: s_cselect_b32 s0, s7, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 6 +; GFX10-NEXT: s_cselect_b32 s0, s8, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 7 +; GFX10-NEXT: s_cselect_b32 s0, s9, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 3 %ext = extractelement <8 x float> %vec, i32 %add @@ -547,6 +849,27 @@ ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v8 ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; MOVREL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v8f32_v_v_offset3: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %add = add i32 %sel, 3 %ext = extractelement <8 x float> %vec, i32 %add @@ -575,6 +898,28 @@ ; GCN-NEXT: s_mov_b32 s15, s17 ; GCN-NEXT: s_movrels_b64 s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f64_s_s_offset1: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 1 %ext = extractelement <8 x double> %vec, i32 %add @@ -603,6 +948,28 @@ ; GCN-NEXT: s_mov_b32 s15, s17 ; GCN-NEXT: s_movrels_b64 s[0:1], s[4:5] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f64_s_s_offset2: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[4:5] +; GFX10-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 2 %ext = extractelement <8 x double> %vec, i32 %add @@ -631,6 +998,28 @@ ; GCN-NEXT: s_mov_b32 s15, s17 ; GCN-NEXT: s_movrels_b64 s[0:1], s[6:7] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f64_s_s_offset3: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[6:7] +; GFX10-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 3 %ext = extractelement <8 x double> %vec, i32 %add @@ -659,6 +1048,28 @@ ; GCN-NEXT: s_mov_b32 s15, s17 ; GCN-NEXT: s_movrels_b64 s[0:1], s[8:9] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f64_s_s_offset4: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[8:9] +; GFX10-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 4 %ext = extractelement <8 x double> %vec, i32 %add @@ -687,6 +1098,28 @@ ; GCN-NEXT: s_mov_b32 s15, s17 ; GCN-NEXT: s_movrels_b64 s[0:1], s[10:11] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f64_s_s_offset5: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[10:11] +; GFX10-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 5 %ext = extractelement <8 x double> %vec, i32 %add @@ -715,6 +1148,28 @@ ; GCN-NEXT: s_mov_b32 s15, s17 ; GCN-NEXT: s_movrels_b64 s[0:1], s[12:13] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f64_s_s_offset6: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[12:13] +; GFX10-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 6 %ext = extractelement <8 x double> %vec, i32 %add @@ -766,6 +1221,28 @@ ; MOVREL-NEXT: s_mov_b32 m0, s18 ; MOVREL-NEXT: s_movrels_b64 s[0:1], s[14:15] ; MOVREL-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f64_s_s_offset7: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[14:15] +; GFX10-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 7 %ext = extractelement <8 x double> %vec, i32 %add @@ -794,6 +1271,28 @@ ; GCN-NEXT: s_mov_b32 s15, s17 ; GCN-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v8f64_s_s_offsetm1: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_add_i32 m0, s18, -1 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GFX10-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, -1 %ext = extractelement <8 x double> %vec, i32 %add @@ -854,6 +1353,34 @@ ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc ; MOVREL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v8f64_v_v_offset3: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %add = add i32 %sel, 3 %ext = extractelement <8 x double> %vec, i32 %add @@ -879,6 +1406,26 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v8 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v8p3_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x i8 addrspace(3)*> %vec, i32 %idx ret i8 addrspace(3)* %ext @@ -925,6 +1472,26 @@ ; MOVREL-NEXT: s_mov_b32 m0, -1 ; MOVREL-NEXT: ds_write_b32 v0, v0 ; MOVREL-NEXT: s_endpgm +; +; GFX10-LABEL: dyn_extract_v8p3_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_cmp_eq_u32 s10, 1 +; GFX10-NEXT: s_cselect_b32 s0, s3, s2 +; GFX10-NEXT: s_cmp_eq_u32 s10, 2 +; GFX10-NEXT: s_cselect_b32 s0, s4, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 3 +; GFX10-NEXT: s_cselect_b32 s0, s5, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 4 +; GFX10-NEXT: s_cselect_b32 s0, s6, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 5 +; GFX10-NEXT: s_cselect_b32 s0, s7, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 6 +; GFX10-NEXT: s_cselect_b32 s0, s8, s0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 7 +; GFX10-NEXT: s_cselect_b32 s0, s9, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ds_write_b32 v0, v0 +; GFX10-NEXT: s_endpgm entry: %ext = extractelement <8 x i8 addrspace(3)*> %vec, i32 %idx store i8 addrspace(3)* %ext, i8 addrspace(3)* addrspace(3)* undef @@ -957,6 +1524,33 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v8p1_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x i8 addrspace(1)*> %vec, i32 %idx ret i8 addrspace(1)* %ext @@ -1012,6 +1606,31 @@ ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; MOVREL-NEXT: s_endpgm +; +; GFX10-LABEL: dyn_extract_v8p1_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_endpgm entry: %ext = extractelement <8 x i8 addrspace(1)*> %vec, i32 %idx store i8 addrspace(1)* %ext, i8 addrspace(1)* addrspace(1)* undef @@ -1031,6 +1650,12 @@ ; MOVREL-NEXT: s_mov_b32 m0, s2 ; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; MOVREL-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v16f32_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: v_movrels_b32_e32 v0, v0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <16 x float> %vec, i32 %sel ret float %ext @@ -1049,6 +1674,12 @@ ; MOVREL-NEXT: s_mov_b32 m0, s2 ; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; MOVREL-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v32f32_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: v_movrels_b32_e32 v0, v0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <32 x float> %vec, i32 %sel ret float %ext @@ -1074,6 +1705,15 @@ ; MOVREL-NEXT: v_readfirstlane_b32 s0, v32 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 ; MOVREL-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v16f64_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_lshl_b32 m0, s2, 1 +; GFX10-NEXT: v_movrels_b32_e32 v32, v0 +; GFX10-NEXT: v_movrels_b32_e32 v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v32 +; GFX10-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <16 x double> %vec, i32 %sel ret double %ext @@ -1102,6 +1742,29 @@ ; GCN-NEXT: s_movrels_b32 s0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v16f32_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: s_mov_b32 s19, 0x41800000 +; GFX10-NEXT: s_mov_b32 s18, 0x41700000 +; GFX10-NEXT: s_mov_b32 s17, 0x41600000 +; GFX10-NEXT: s_mov_b32 s16, 0x41500000 +; GFX10-NEXT: s_mov_b32 s15, 0x41400000 +; GFX10-NEXT: s_mov_b32 s14, 0x41300000 +; GFX10-NEXT: s_mov_b32 s13, 0x41200000 +; GFX10-NEXT: s_mov_b32 s12, 0x41100000 +; GFX10-NEXT: s_mov_b32 s11, 0x41000000 +; GFX10-NEXT: s_mov_b32 s10, 0x40e00000 +; GFX10-NEXT: s_mov_b32 s9, 0x40c00000 +; GFX10-NEXT: s_mov_b32 s8, 0x40a00000 +; GFX10-NEXT: s_mov_b32 s7, 4.0 +; GFX10-NEXT: s_mov_b32 s6, 0x40400000 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: s_movrels_b32 s0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <16 x float> , i32 %sel ret float %ext @@ -1146,6 +1809,45 @@ ; GCN-NEXT: s_movrels_b32 s0, s36 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v32f32_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s36, 1.0 +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: s_mov_b32 s67, 0x42000000 +; GFX10-NEXT: s_mov_b32 s66, 0x41f80000 +; GFX10-NEXT: s_mov_b32 s65, 0x41f00000 +; GFX10-NEXT: s_mov_b32 s64, 0x41e80000 +; GFX10-NEXT: s_mov_b32 s63, 0x41e00000 +; GFX10-NEXT: s_mov_b32 s62, 0x41d80000 +; GFX10-NEXT: s_mov_b32 s61, 0x41d00000 +; GFX10-NEXT: s_mov_b32 s60, 0x41c80000 +; GFX10-NEXT: s_mov_b32 s59, 0x41c00000 +; GFX10-NEXT: s_mov_b32 s58, 0x41b80000 +; GFX10-NEXT: s_mov_b32 s57, 0x41b00000 +; GFX10-NEXT: s_mov_b32 s56, 0x41a80000 +; GFX10-NEXT: s_mov_b32 s55, 0x41a00000 +; GFX10-NEXT: s_mov_b32 s54, 0x41980000 +; GFX10-NEXT: s_mov_b32 s53, 0x41900000 +; GFX10-NEXT: s_mov_b32 s52, 0x41880000 +; GFX10-NEXT: s_mov_b32 s51, 0x41800000 +; GFX10-NEXT: s_mov_b32 s50, 0x41700000 +; GFX10-NEXT: s_mov_b32 s49, 0x41600000 +; GFX10-NEXT: s_mov_b32 s48, 0x41500000 +; GFX10-NEXT: s_mov_b32 s47, 0x41400000 +; GFX10-NEXT: s_mov_b32 s46, 0x41300000 +; GFX10-NEXT: s_mov_b32 s45, 0x41200000 +; GFX10-NEXT: s_mov_b32 s44, 0x41100000 +; GFX10-NEXT: s_mov_b32 s43, 0x41000000 +; GFX10-NEXT: s_mov_b32 s42, 0x40e00000 +; GFX10-NEXT: s_mov_b32 s41, 0x40c00000 +; GFX10-NEXT: s_mov_b32 s40, 0x40a00000 +; GFX10-NEXT: s_mov_b32 s39, 4.0 +; GFX10-NEXT: s_mov_b32 s38, 0x40400000 +; GFX10-NEXT: s_mov_b32 s37, 2.0 +; GFX10-NEXT: s_movrels_b32 s0, s36 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <32 x float> , i32 %sel ret float %ext @@ -1186,6 +1888,41 @@ ; GCN-NEXT: s_mov_b64 s[38:39], 2.0 ; GCN-NEXT: s_movrels_b64 s[0:1], s[36:37] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v16f64_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s66, 0 +; GFX10-NEXT: s_mov_b64 s[36:37], 1.0 +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: s_mov_b32 s67, 0x40300000 +; GFX10-NEXT: s_mov_b32 s65, 0x402e0000 +; GFX10-NEXT: s_mov_b32 s64, s66 +; GFX10-NEXT: s_mov_b32 s63, 0x402c0000 +; GFX10-NEXT: s_mov_b32 s62, s66 +; GFX10-NEXT: s_mov_b32 s61, 0x402a0000 +; GFX10-NEXT: s_mov_b32 s60, s66 +; GFX10-NEXT: s_mov_b32 s59, 0x40280000 +; GFX10-NEXT: s_mov_b32 s58, s66 +; GFX10-NEXT: s_mov_b32 s57, 0x40260000 +; GFX10-NEXT: s_mov_b32 s56, s66 +; GFX10-NEXT: s_mov_b32 s55, 0x40240000 +; GFX10-NEXT: s_mov_b32 s54, s66 +; GFX10-NEXT: s_mov_b32 s53, 0x40220000 +; GFX10-NEXT: s_mov_b32 s52, s66 +; GFX10-NEXT: s_mov_b32 s51, 0x40200000 +; GFX10-NEXT: s_mov_b32 s50, s66 +; GFX10-NEXT: s_mov_b32 s49, 0x401c0000 +; GFX10-NEXT: s_mov_b32 s48, s66 +; GFX10-NEXT: s_mov_b32 s47, 0x40180000 +; GFX10-NEXT: s_mov_b32 s46, s66 +; GFX10-NEXT: s_mov_b32 s45, 0x40140000 +; GFX10-NEXT: s_mov_b32 s44, s66 +; GFX10-NEXT: s_mov_b64 s[42:43], 4.0 +; GFX10-NEXT: s_mov_b32 s41, 0x40080000 +; GFX10-NEXT: s_mov_b32 s40, s66 +; GFX10-NEXT: s_mov_b64 s[38:39], 2.0 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[36:37] +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <16 x double> , i32 %sel ret double %ext @@ -1213,6 +1950,27 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v6f32_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s5, vcc_lo +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext @@ -1233,6 +1991,22 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v6f32_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext @@ -1252,6 +2026,20 @@ ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v6f32_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext @@ -1272,6 +2060,21 @@ ; GCN-NEXT: s_cselect_b32 s0, s7, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v6f32_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10-NEXT: s_cselect_b32 s0, s3, s2 +; GFX10-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10-NEXT: s_cselect_b32 s0, s4, s0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10-NEXT: s_cselect_b32 s0, s5, s0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 4 +; GFX10-NEXT: s_cselect_b32 s0, s6, s0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 5 +; GFX10-NEXT: s_cselect_b32 s0, s7, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext @@ -1303,6 +2106,30 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v7, vcc ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v7f32_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s6, vcc_lo +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext @@ -1325,6 +2152,24 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v7 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v7f32_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext @@ -1346,6 +2191,22 @@ ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v7f32_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext @@ -1368,6 +2229,23 @@ ; GCN-NEXT: s_cselect_b32 s0, s8, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v7f32_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_cmp_eq_u32 s9, 1 +; GFX10-NEXT: s_cselect_b32 s0, s3, s2 +; GFX10-NEXT: s_cmp_eq_u32 s9, 2 +; GFX10-NEXT: s_cselect_b32 s0, s4, s0 +; GFX10-NEXT: s_cmp_eq_u32 s9, 3 +; GFX10-NEXT: s_cselect_b32 s0, s5, s0 +; GFX10-NEXT: s_cmp_eq_u32 s9, 4 +; GFX10-NEXT: s_cselect_b32 s0, s6, s0 +; GFX10-NEXT: s_cmp_eq_u32 s9, 5 +; GFX10-NEXT: s_cselect_b32 s0, s7, s0 +; GFX10-NEXT: s_cmp_eq_u32 s9, 6 +; GFX10-NEXT: s_cselect_b32 s0, s8, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext @@ -1414,6 +2292,41 @@ ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v6f64_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s15, s5 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s15 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s14, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: s_mov_b32 s47, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s47, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x double> %vec, i32 %sel ret double %ext @@ -1439,6 +2352,27 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v6f64_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v15, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <6 x double> %vec, i32 %sel ret double %ext @@ -1464,6 +2398,15 @@ ; MOVREL-NEXT: v_readfirstlane_b32 s0, v12 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 ; MOVREL-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v6f64_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_lshl_b32 m0, s2, 1 +; GFX10-NEXT: v_movrels_b32_e32 v12, v0 +; GFX10-NEXT: v_movrels_b32_e32 v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v12 +; GFX10-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x double> %vec, i32 %sel ret double %ext @@ -1487,6 +2430,24 @@ ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v6f64_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 m0, s14 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x double> %vec, i32 %sel ret double %ext @@ -1540,6 +2501,46 @@ ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v7f64_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s19, s5 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s19 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s46, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: s_mov_b32 s47, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s47, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s13, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x double> %vec, i32 %sel ret double %ext @@ -1568,6 +2569,30 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v7f64_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v15, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <7 x double> %vec, i32 %sel ret double %ext @@ -1593,6 +2618,15 @@ ; MOVREL-NEXT: v_readfirstlane_b32 s0, v14 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 ; MOVREL-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v7f64_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_lshl_b32 m0, s2, 1 +; GFX10-NEXT: v_movrels_b32_e32 v14, v0 +; GFX10-NEXT: v_movrels_b32_e32 v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v14 +; GFX10-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x double> %vec, i32 %sel ret double %ext @@ -1618,6 +2652,26 @@ ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v7f64_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 m0, s16 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x double> %vec, i32 %sel ret double %ext @@ -1806,6 +2860,98 @@ ; MOVREL-NEXT: v_mov_b32_e32 v3, s7 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; MOVREL-NEXT: s_endpgm +; +; GFX10-LABEL: dyn_extract_v5f64_s_s: +; GFX10: .amd_kernel_code_t +; GFX10-NEXT: amd_code_version_major = 1 +; GFX10-NEXT: amd_code_version_minor = 2 +; GFX10-NEXT: amd_machine_kind = 1 +; GFX10-NEXT: amd_machine_version_major = 10 +; GFX10-NEXT: amd_machine_version_minor = 1 +; GFX10-NEXT: amd_machine_version_stepping = 0 +; GFX10-NEXT: kernel_code_entry_byte_offset = 256 +; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 +; GFX10-NEXT: granulated_workitem_vgpr_count = 0 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: priority = 0 +; GFX10-NEXT: float_mode = 240 +; GFX10-NEXT: priv = 0 +; GFX10-NEXT: enable_dx10_clamp = 1 +; GFX10-NEXT: debug_mode = 0 +; GFX10-NEXT: enable_ieee_mode = 1 +; GFX10-NEXT: enable_wgp_mode = 1 +; GFX10-NEXT: enable_mem_ordered = 1 +; GFX10-NEXT: enable_fwd_progress = 0 +; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GFX10-NEXT: user_sgpr_count = 6 +; GFX10-NEXT: enable_trap_handler = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 +; GFX10-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX10-NEXT: enable_sgpr_workgroup_info = 0 +; GFX10-NEXT: enable_vgpr_workitem_id = 0 +; GFX10-NEXT: enable_exception_msb = 0 +; GFX10-NEXT: granulated_lds_size = 0 +; GFX10-NEXT: enable_exception = 0 +; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX10-NEXT: enable_sgpr_queue_ptr = 0 +; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GFX10-NEXT: enable_sgpr_dispatch_id = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_private_segment_size = 0 +; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GFX10-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GFX10-NEXT: enable_wavefront_size32 = 1 +; GFX10-NEXT: enable_ordered_append_gds = 0 +; GFX10-NEXT: private_element_size = 1 +; GFX10-NEXT: is_ptr64 = 1 +; GFX10-NEXT: is_dynamic_callstack = 0 +; GFX10-NEXT: is_debug_enabled = 0 +; GFX10-NEXT: is_xnack_enabled = 1 +; GFX10-NEXT: workitem_private_segment_byte_size = 0 +; GFX10-NEXT: workgroup_group_segment_byte_size = 0 +; GFX10-NEXT: gds_segment_byte_size = 0 +; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: workgroup_fbarrier_count = 0 +; GFX10-NEXT: wavefront_sgpr_count = 9 +; GFX10-NEXT: workitem_vgpr_count = 3 +; GFX10-NEXT: reserved_vgpr_first = 0 +; GFX10-NEXT: reserved_vgpr_count = 0 +; GFX10-NEXT: reserved_sgpr_first = 0 +; GFX10-NEXT: reserved_sgpr_count = 0 +; GFX10-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GFX10-NEXT: debug_private_segment_buffer_sgpr = 0 +; GFX10-NEXT: kernarg_segment_alignment = 4 +; GFX10-NEXT: group_segment_alignment = 4 +; GFX10-NEXT: private_segment_alignment = 4 +; GFX10-NEXT: wavefront_size = 5 +; GFX10-NEXT: call_convention = -1 +; GFX10-NEXT: runtime_loader_kernel_symbol = 0 +; GFX10-NEXT: .end_amd_kernel_code_t +; GFX10-NEXT: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_mov_b32 s5, 0x40080000 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x40140000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] +; GFX10-NEXT: s_cmp_eq_u32 s8, 4 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm entry: %ext = extractelement <5 x double> , i32 %sel store double %ext, double addrspace(1)* %out @@ -1857,6 +3003,40 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v12, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v15f32_const_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, 0x41700000, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <15 x float> , i32 %sel ret float %ext @@ -1884,6 +3064,28 @@ ; GCN-NEXT: s_movrels_b32 s0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v15f32_const_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: s_mov_b32 s18, 0x41700000 +; GFX10-NEXT: s_mov_b32 s17, 0x41600000 +; GFX10-NEXT: s_mov_b32 s16, 0x41500000 +; GFX10-NEXT: s_mov_b32 s15, 0x41400000 +; GFX10-NEXT: s_mov_b32 s14, 0x41300000 +; GFX10-NEXT: s_mov_b32 s13, 0x41200000 +; GFX10-NEXT: s_mov_b32 s12, 0x41100000 +; GFX10-NEXT: s_mov_b32 s11, 0x41000000 +; GFX10-NEXT: s_mov_b32 s10, 0x40e00000 +; GFX10-NEXT: s_mov_b32 s9, 0x40c00000 +; GFX10-NEXT: s_mov_b32 s8, 0x40a00000 +; GFX10-NEXT: s_mov_b32 s7, 4.0 +; GFX10-NEXT: s_mov_b32 s6, 0x40400000 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: s_movrels_b32 s0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <15 x float> , i32 %sel ret float %ext @@ -1947,6 +3149,54 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v15f32_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s46, s12 +; GFX10-NEXT: s_mov_b32 s47, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s47, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <15 x float> %vec, i32 %sel ret float %ext @@ -1985,6 +3235,40 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v15 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v15f32_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <15 x float> %vec, i32 %sel ret float %ext @@ -2003,6 +3287,12 @@ ; MOVREL-NEXT: s_mov_b32 m0, s2 ; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; MOVREL-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v15f32_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: v_movrels_b32_e32 v0, v0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <15 x float> %vec, i32 %sel ret float %ext @@ -2030,6 +3320,28 @@ ; GCN-NEXT: s_movrels_b32 s0, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v15f32_s_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 m0, s17 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_movrels_b32 s0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog entry: %ext = extractelement <15 x float> %vec, i32 %sel ret float %ext @@ -2057,6 +3369,28 @@ ; GCN-NEXT: s_movrels_b32 s0, s3 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v15f32_s_s_offset3: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 m0, s17 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_movrels_b32 s0, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 3 %ext = extractelement <15 x float> %vec, i32 %add @@ -2131,6 +3465,41 @@ ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v15 ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; MOVREL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: dyn_extract_v15f32_v_v_offset3: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %add = add i32 %sel, 3 %ext = extractelement <15 x float> %vec, i32 %add @@ -2306,6 +3675,91 @@ ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: flat_store_dword v[0:1], v2 ; MOVREL-NEXT: s_endpgm +; +; GFX10-LABEL: dyn_extract_v4f32_s_s_s: +; GFX10: .amd_kernel_code_t +; GFX10-NEXT: amd_code_version_major = 1 +; GFX10-NEXT: amd_code_version_minor = 2 +; GFX10-NEXT: amd_machine_kind = 1 +; GFX10-NEXT: amd_machine_version_major = 10 +; GFX10-NEXT: amd_machine_version_minor = 1 +; GFX10-NEXT: amd_machine_version_stepping = 0 +; GFX10-NEXT: kernel_code_entry_byte_offset = 256 +; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 +; GFX10-NEXT: granulated_workitem_vgpr_count = 0 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 +; GFX10-NEXT: priority = 0 +; GFX10-NEXT: float_mode = 240 +; GFX10-NEXT: priv = 0 +; GFX10-NEXT: enable_dx10_clamp = 1 +; GFX10-NEXT: debug_mode = 0 +; GFX10-NEXT: enable_ieee_mode = 1 +; GFX10-NEXT: enable_wgp_mode = 1 +; GFX10-NEXT: enable_mem_ordered = 1 +; GFX10-NEXT: enable_fwd_progress = 0 +; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GFX10-NEXT: user_sgpr_count = 6 +; GFX10-NEXT: enable_trap_handler = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 +; GFX10-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX10-NEXT: enable_sgpr_workgroup_info = 0 +; GFX10-NEXT: enable_vgpr_workitem_id = 0 +; GFX10-NEXT: enable_exception_msb = 0 +; GFX10-NEXT: granulated_lds_size = 0 +; GFX10-NEXT: enable_exception = 0 +; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX10-NEXT: enable_sgpr_queue_ptr = 0 +; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GFX10-NEXT: enable_sgpr_dispatch_id = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_private_segment_size = 0 +; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GFX10-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GFX10-NEXT: enable_wavefront_size32 = 1 +; GFX10-NEXT: enable_ordered_append_gds = 0 +; GFX10-NEXT: private_element_size = 1 +; GFX10-NEXT: is_ptr64 = 1 +; GFX10-NEXT: is_dynamic_callstack = 0 +; GFX10-NEXT: is_debug_enabled = 0 +; GFX10-NEXT: is_xnack_enabled = 1 +; GFX10-NEXT: workitem_private_segment_byte_size = 0 +; GFX10-NEXT: workgroup_group_segment_byte_size = 0 +; GFX10-NEXT: gds_segment_byte_size = 0 +; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: workgroup_fbarrier_count = 0 +; GFX10-NEXT: wavefront_sgpr_count = 6 +; GFX10-NEXT: workitem_vgpr_count = 2 +; GFX10-NEXT: reserved_vgpr_first = 0 +; GFX10-NEXT: reserved_vgpr_count = 0 +; GFX10-NEXT: reserved_sgpr_first = 0 +; GFX10-NEXT: reserved_sgpr_count = 0 +; GFX10-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GFX10-NEXT: debug_private_segment_buffer_sgpr = 0 +; GFX10-NEXT: kernarg_segment_alignment = 4 +; GFX10-NEXT: group_segment_alignment = 4 +; GFX10-NEXT: private_segment_alignment = 4 +; GFX10-NEXT: wavefront_size = 5 +; GFX10-NEXT: call_convention = -1 +; GFX10-NEXT: runtime_loader_kernel_symbol = 0 +; GFX10-NEXT: .end_amd_kernel_code_t +; GFX10-NEXT: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: s_cselect_b32 s3, 2.0, 1.0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 2 +; GFX10-NEXT: s_cselect_b32 s3, 0x40400000, s3 +; GFX10-NEXT: s_cmp_eq_u32 s2, 3 +; GFX10-NEXT: s_cselect_b32 s2, 4.0, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm entry: %ext = extractelement <4 x float> , i32 %sel store float %ext, float addrspace(1)* %out @@ -2487,6 +3941,94 @@ ; MOVREL-NEXT: v_mov_b32_e32 v3, s3 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; MOVREL-NEXT: s_endpgm +; +; GFX10-LABEL: dyn_extract_v4f64_s_s_s: +; GFX10: .amd_kernel_code_t +; GFX10-NEXT: amd_code_version_major = 1 +; GFX10-NEXT: amd_code_version_minor = 2 +; GFX10-NEXT: amd_machine_kind = 1 +; GFX10-NEXT: amd_machine_version_major = 10 +; GFX10-NEXT: amd_machine_version_minor = 1 +; GFX10-NEXT: amd_machine_version_stepping = 0 +; GFX10-NEXT: kernel_code_entry_byte_offset = 256 +; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 +; GFX10-NEXT: granulated_workitem_vgpr_count = 0 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 +; GFX10-NEXT: priority = 0 +; GFX10-NEXT: float_mode = 240 +; GFX10-NEXT: priv = 0 +; GFX10-NEXT: enable_dx10_clamp = 1 +; GFX10-NEXT: debug_mode = 0 +; GFX10-NEXT: enable_ieee_mode = 1 +; GFX10-NEXT: enable_wgp_mode = 1 +; GFX10-NEXT: enable_mem_ordered = 1 +; GFX10-NEXT: enable_fwd_progress = 0 +; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GFX10-NEXT: user_sgpr_count = 6 +; GFX10-NEXT: enable_trap_handler = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 +; GFX10-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX10-NEXT: enable_sgpr_workgroup_info = 0 +; GFX10-NEXT: enable_vgpr_workitem_id = 0 +; GFX10-NEXT: enable_exception_msb = 0 +; GFX10-NEXT: granulated_lds_size = 0 +; GFX10-NEXT: enable_exception = 0 +; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX10-NEXT: enable_sgpr_queue_ptr = 0 +; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GFX10-NEXT: enable_sgpr_dispatch_id = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_private_segment_size = 0 +; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GFX10-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GFX10-NEXT: enable_wavefront_size32 = 1 +; GFX10-NEXT: enable_ordered_append_gds = 0 +; GFX10-NEXT: private_element_size = 1 +; GFX10-NEXT: is_ptr64 = 1 +; GFX10-NEXT: is_dynamic_callstack = 0 +; GFX10-NEXT: is_debug_enabled = 0 +; GFX10-NEXT: is_xnack_enabled = 1 +; GFX10-NEXT: workitem_private_segment_byte_size = 0 +; GFX10-NEXT: workgroup_group_segment_byte_size = 0 +; GFX10-NEXT: gds_segment_byte_size = 0 +; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: workgroup_fbarrier_count = 0 +; GFX10-NEXT: wavefront_sgpr_count = 7 +; GFX10-NEXT: workitem_vgpr_count = 3 +; GFX10-NEXT: reserved_vgpr_first = 0 +; GFX10-NEXT: reserved_vgpr_count = 0 +; GFX10-NEXT: reserved_sgpr_first = 0 +; GFX10-NEXT: reserved_sgpr_count = 0 +; GFX10-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GFX10-NEXT: debug_private_segment_buffer_sgpr = 0 +; GFX10-NEXT: kernarg_segment_alignment = 4 +; GFX10-NEXT: group_segment_alignment = 4 +; GFX10-NEXT: private_segment_alignment = 4 +; GFX10-NEXT: wavefront_size = 5 +; GFX10-NEXT: call_convention = -1 +; GFX10-NEXT: runtime_loader_kernel_symbol = 0 +; GFX10-NEXT: .end_amd_kernel_code_t +; GFX10-NEXT: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_mov_b32 s3, 0x40080000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm entry: %ext = extractelement <4 x double> , i32 %sel store double %ext, double addrspace(1)* %out @@ -2511,6 +4053,15 @@ ; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: v_mov_b32_e32 v0, v7 ; MOVREL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_extract_v64i32_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 7 ret i32 %elt @@ -2536,6 +4087,14 @@ ; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_extract_v64i32_32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 32 ret i32 %elt @@ -2563,6 +4122,15 @@ ; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: v_mov_b32_e32 v0, v1 ; MOVREL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_extract_v64i32_33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 33 ret i32 %elt @@ -2598,6 +4166,21 @@ ; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: v_mov_b32_e32 v0, v5 ; MOVREL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_extract_v64i32_37: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_movk_i32 s4, 0x80 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 37 ret i32 %elt diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -9,6 +9,9 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s + define half @v_fdiv_f16(half %a, half %b) { ; GFX6-IEEE-LABEL: v_fdiv_f16: ; GFX6-IEEE: ; %bb.0: @@ -61,6 +64,18 @@ ; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half %a, %b ret half %fdiv } @@ -82,6 +97,14 @@ ; GFX89-NEXT: v_rcp_f16_e32 v1, v1 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f16_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f16_e32 v1, v1 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn half %a, %b ret half %fdiv } @@ -138,6 +161,18 @@ ; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f16_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half %a, %b, !fpmath !0 ret half %fdiv } @@ -194,6 +229,18 @@ ; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half 1.0, %x ret half %fdiv } @@ -250,6 +297,18 @@ ; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_f16_arcp: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp half 1.0, %x ret half %fdiv } @@ -270,6 +329,13 @@ ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: v_rcp_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_f16_arcp_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f16_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn half 1.0, %x ret half %fdiv } @@ -321,6 +387,13 @@ ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: v_rcp_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_f16_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f16_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half 1.0, %x, !fpmath !0 ret half %fdiv } @@ -342,6 +415,14 @@ ; GFX89-NEXT: v_rcp_f16_e32 v1, v1 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f16_afn_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f16_e32 v1, v1 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn half %a, %b, !fpmath !0 ret half %fdiv } @@ -398,6 +479,18 @@ ; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f16_arcp_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp half %a, %b, !fpmath !0 ret half %fdiv } @@ -529,6 +622,28 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX10-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0 +; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> %a, %b ret <2 x half> %fdiv } @@ -577,6 +692,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f16_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f16_e32 v2, v1 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v2, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x half> %a, %b ret <2 x half> %fdiv } @@ -708,6 +834,28 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f16_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX10-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0 +; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv } @@ -833,6 +981,26 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x ret <2 x half> %fdiv } @@ -958,6 +1126,26 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_v2f16_arcp: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> , %x ret <2 x half> %fdiv } @@ -1000,6 +1188,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_v2f16_arcp_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f16_e32 v1, v0 +; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn <2 x half> , %x ret <2 x half> %fdiv } @@ -1104,6 +1301,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_v2f16_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f16_e32 v1, v0 +; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x, !fpmath !0 ret <2 x half> %fdiv } @@ -1152,6 +1358,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f16_e32 v2, v1 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v2, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv } @@ -1283,6 +1500,28 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX10-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0 +; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv } @@ -1331,6 +1570,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f16_e32 v2, v1 +; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v2, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -8,6 +8,9 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-IEEE,GFX89-IEEE %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH,GFX89-FLUSH %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-IEEE %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-FLUSH %s + define float @v_fdiv_f32(float %a, float %b) { ; GFX6-IEEE-LABEL: v_fdiv_f32: ; GFX6-IEEE: ; %bb.0: @@ -76,6 +79,42 @@ ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_f32: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5 +; GFX10-FLUSH-NEXT: s_denorm_mode 0 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v4, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float %a, %b ret float %fdiv } @@ -87,6 +126,14 @@ ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f32_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn float %a, %b ret float %fdiv } @@ -136,6 +183,35 @@ ; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_f32_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_f32_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v1| +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float %a, %b, !fpmath !0 ret float %fdiv } @@ -208,6 +284,42 @@ ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_rcp_f32: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v5, v3, -v1, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v4, -v1, v3 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v4, v2, v3 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_rcp_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, v4, -v1, v3 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v1, v4 +; GFX10-FLUSH-NEXT: s_denorm_mode 0 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v3, v2, v4 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float 1.0, %x ret float %fdiv } @@ -280,6 +392,42 @@ ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_rcp_f32_arcp: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v5, v3, -v1, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v4, -v1, v3 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v4, v2, v3 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_rcp_f32_arcp: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, v4, -v1, v3 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v1, v4 +; GFX10-FLUSH-NEXT: s_denorm_mode 0 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v3, v2, v4 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp float 1.0, %x ret float %fdiv } @@ -290,6 +438,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_f32_arcp_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn float 1.0, %x ret float %fdiv } @@ -313,6 +468,25 @@ ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_rcp_f32_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v0| +; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x2f800000, s4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_rcp_f32_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float 1.0, %x, !fpmath !0 ret float %fdiv } @@ -324,6 +498,14 @@ ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f32_afn_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn float %a, %b, !fpmath !0 ret float %fdiv } @@ -373,6 +555,35 @@ ; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v1| +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp float %a, %b, !fpmath !0 ret float %fdiv } @@ -494,6 +705,67 @@ ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v5, v2, v4 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_v2f32: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 +; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10 +; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15 +; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9 +; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_v2f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v5 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX10-FLUSH-NEXT: v_fma_f32 v8, v7, -v4, v6 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5 +; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v6, -v4, v7 +; GFX10-FLUSH-NEXT: s_denorm_mode 0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v3, v3, v1 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v5, v6, v5, v7 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v11, vcc_lo, v1, v3, v1 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v4 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v5, v2, v0 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v6, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v5, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v11, v6 +; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v11 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v6 +; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v11, -v4, v5 +; GFX10-FLUSH-NEXT: s_denorm_mode 0 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v11, v6, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b ret <2 x float> %fdiv } @@ -507,6 +779,16 @@ ; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f32_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x float> %a, %b ret <2 x float> %fdiv } @@ -585,6 +867,55 @@ ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_v2f32_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 +; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10 +; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15 +; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9 +; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_v2f32_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 +; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 +; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 +; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v7, 1.0, s5, s6 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v6, 1.0, s5, s4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v7, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v6, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv } @@ -706,6 +1037,67 @@ ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_rcp_v2f32: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v13, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v8, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v8, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v7, v8 +; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v13, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v8 +; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v13 +; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v13, -v2, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v13, v4, v7 +; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v8, v9 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_rcp_v2f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5 +; GFX10-FLUSH-NEXT: s_denorm_mode 0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v3, v4, v3, v5 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v3 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v2, v5 +; GFX10-FLUSH-NEXT: s_denorm_mode 0 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> , %x ret <2 x float> %fdiv } @@ -827,6 +1219,67 @@ ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_rcp_v2f32_arcp: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v13, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v8, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v8, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v7, v8 +; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v13, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v8 +; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v13 +; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v13, -v2, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v13, v4, v7 +; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v8, v9 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_rcp_v2f32_arcp: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5 +; GFX10-FLUSH-NEXT: s_denorm_mode 0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v3, v4, v3, v5 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0 +; GFX10-FLUSH-NEXT: s_denorm_mode 3 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v3 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v2, v5 +; GFX10-FLUSH-NEXT: s_denorm_mode 0 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> , %x ret <2 x float> %fdiv } @@ -838,6 +1291,14 @@ ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_v2f32_arcp_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn <2 x float> , %x ret <2 x float> %fdiv } @@ -868,6 +1329,34 @@ ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_rcp_v2f32_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: s_mov_b32 s4, 0x6f800000 +; GFX10-IEEE-NEXT: s_mov_b32 s5, 0x2f800000 +; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s6, |v0|, s4 +; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s4, |v1|, s4 +; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, s5, s6 +; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, s5, s4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_rcp_v2f32_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> , %x, !fpmath !0 ret <2 x float> %fdiv } @@ -881,6 +1370,16 @@ ; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f32_afn_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv } @@ -959,6 +1458,55 @@ ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX10-IEEE: ; %bb.0: +; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 +; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10 +; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15 +; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9 +; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1 +; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 +; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 +; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 +; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v7, 1.0, s5, s6 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v6, 1.0, s5, s4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v7, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v6, v1 +; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv } @@ -972,6 +1520,16 @@ ; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f32_arcp_afn_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn arcp <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -8,6 +8,9 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s + define double @v_fdiv_f64(double %a, double %b) { ; GFX6-LABEL: v_fdiv_f64: ; GFX6: ; %bb.0: @@ -59,6 +62,23 @@ ; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv double %a, %b ret double %fdiv } @@ -76,6 +96,20 @@ ; GCN-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] ; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f64_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[10:11] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[10:11], v[6:7] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn double %a, %b ret double %fdiv } @@ -131,6 +165,23 @@ ; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f64_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv double %a, %b, !fpmath !0 ret double %fdiv } @@ -187,6 +238,23 @@ ; GFX9-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv double 1.0, %x ret double %fdiv } @@ -243,6 +311,23 @@ ; GFX9-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_f64_arcp: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp double 1.0, %x ret double %fdiv } @@ -260,6 +345,20 @@ ; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 ; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_f64_arcp_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_mul_f64 v[6:7], 1.0, v[2:3] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn double 1.0, %x ret double %fdiv } @@ -316,6 +415,23 @@ ; GFX9-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_f64_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv double 1.0, %x, !fpmath !0 ret double %fdiv } @@ -333,6 +449,20 @@ ; GCN-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] ; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f64_afn_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[10:11] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[10:11], v[6:7] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn double %a, %b, !fpmath !0 ret double %fdiv } @@ -388,6 +518,23 @@ ; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f64_arcp_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp double %a, %b, !fpmath !0 ret double %fdiv } @@ -482,6 +629,39 @@ ; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v30, v4 +; GFX10-NEXT: v_mov_b32_e32 v31, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27] +; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] +; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7] +; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21] +; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19] +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1] +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b ret <2 x double> %fdiv } @@ -507,6 +687,36 @@ ; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] ; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f64_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v18, v4 +; GFX10-NEXT: v_mov_b32_e32 v19, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: v_mov_b32_e32 v22, v0 +; GFX10-NEXT: v_mov_b32_e32 v23, v1 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19] +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9] +; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11] +; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1] +; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13] +; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x double> %a, %b ret <2 x double> %fdiv } @@ -601,6 +811,39 @@ ; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f64_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v30, v4 +; GFX10-NEXT: v_mov_b32_e32 v31, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27] +; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] +; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7] +; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21] +; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19] +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1] +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv } @@ -695,6 +938,35 @@ ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_v2f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7] +; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15] +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> , %x ret <2 x double> %fdiv } @@ -789,6 +1061,35 @@ ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_v2f64_arcp: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7] +; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15] +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> , %x ret <2 x double> %fdiv } @@ -814,6 +1115,32 @@ ; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] ; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_v2f64_arcp_afn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v14, v0 +; GFX10-NEXT: v_mov_b32_e32 v15, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[14:15] +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[0:1] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[14:15], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[2:3], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[14:15], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[2:3], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] +; GFX10-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5] +; GFX10-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[14:15], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], -v[0:1], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9] +; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn <2 x double> , %x ret <2 x double> %fdiv } @@ -908,6 +1235,35 @@ ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_rcp_v2f64_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7] +; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15] +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> , %x, !fpmath !0 ret <2 x double> %fdiv } @@ -933,6 +1289,36 @@ ; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] ; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f64_afn_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v18, v4 +; GFX10-NEXT: v_mov_b32_e32 v19, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: v_mov_b32_e32 v22, v0 +; GFX10-NEXT: v_mov_b32_e32 v23, v1 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19] +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9] +; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11] +; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1] +; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13] +; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv } @@ -1027,6 +1413,39 @@ ; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f64_arcp_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v30, v4 +; GFX10-NEXT: v_mov_b32_e32 v31, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27] +; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] +; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7] +; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21] +; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19] +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1] +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv } @@ -1052,6 +1471,36 @@ ; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] ; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_v2f64_arcp_afn_ulp25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v18, v4 +; GFX10-NEXT: v_mov_b32_e32 v19, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: v_mov_b32_e32 v22, v0 +; GFX10-NEXT: v_mov_b32_e32 v23, v1 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19] +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9] +; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11] +; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1] +; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13] +; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define float @v_fma_f32(float %x, float %y, float %z) { ; GFX6-LABEL: v_fma_f32: @@ -21,6 +22,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fma = call float @llvm.fma.f32(float %x, float %y, float %z) ret float %fma } @@ -46,6 +54,14 @@ ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX10-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) ret <2 x float> %fma } @@ -72,6 +88,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fma = call half @llvm.fma.f16(half %x, half %y, half %z) ret half %fma } @@ -117,6 +141,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) ret <2 x half> %fma } @@ -164,6 +195,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v2f16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y, <2 x half> %z) ret <2 x half> %fma @@ -212,6 +250,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v2f16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y.fneg, <2 x half> %z) ret <2 x half> %fma @@ -264,6 +309,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v2f16_fneg_lhs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg, <2 x half> %z) @@ -328,6 +380,14 @@ ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4 ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v4f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fma = call <4 x half> @llvm.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) ret <4 x half> %fma } @@ -350,6 +410,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fma = call double @llvm.fma.f64(double %x, double %y, double %z) ret double %fma } @@ -372,6 +439,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_f64_fneg_all: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg double %x %neg.y = fneg double %y %neg.z = fneg double %z @@ -400,6 +474,18 @@ ; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] ; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v2f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v14, v0 +; GFX10-NEXT: v_mov_b32_e32 v15, v1 +; GFX10-NEXT: v_mov_b32_e32 v12, v2 +; GFX10-NEXT: v_mov_b32_e32 v13, v3 +; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9] +; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11] +; GFX10-NEXT: s_setpc_b64 s[30:31] %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) ret <2 x double> %fma } @@ -422,6 +508,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_fma_f32 v0, |v0|, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_f32_fabs_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, v1, |v0|, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z) ret float %fma @@ -445,6 +538,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_fma_f32 v0, v0, |v1|, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_f32_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, |v1|, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %fma = call float @llvm.fma.f32(float %x, float %fabs.y, float %z) ret float %fma @@ -468,6 +568,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_f32_fabs_lhs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, |v1|, |v0|, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fabs.y = call float @llvm.fabs.f32(float %y) %fma = call float @llvm.fma.f32(float %fabs.x, float %fabs.y, float %z) @@ -489,6 +596,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fma_f32_sgpr_vgpr_vgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX10-NEXT: ; return to shader part epilog %fma = call float @llvm.fma.f32(float %x, float %y, float %z) ret float %fma } @@ -508,6 +620,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_fma_f32 v0, v0, s0, v1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fma_f32_vgpr_sgpr_vgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX10-NEXT: ; return to shader part epilog %fma = call float @llvm.fma.f32(float %x, float %y, float %z) ret float %fma } @@ -533,6 +650,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fma_f32_sgpr_sgpr_sgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_fma_f32 v0, s1, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %fma = call float @llvm.fma.f32(float %x, float %y, float %z) ret float %fma } @@ -555,6 +678,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_fma_f32 v0, -v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_f32_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, v1, -v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z) ret float %fma @@ -578,6 +708,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_fma_f32 v0, v0, -v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_f32_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, -v1, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg float %y %fma = call float @llvm.fma.f32(float %x, float %neg.y, float %z) ret float %fma @@ -601,6 +738,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_fma_f32 v0, v0, v1, -v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_f32_fneg_z: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.z = fneg float %z %fma = call float @llvm.fma.f32(float %x, float %y, float %neg.z) ret float %fma diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { ; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: @@ -74,6 +75,22 @@ ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -189,6 +206,31 @@ ; GFX9-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -283,6 +325,25 @@ ; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s2, 0x80000000 +; GFX10-NEXT: v_sub_f32_e32 v1, s2, v1 +; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3| +; GFX10-NEXT: v_med3_f32 v1, v1, |v2|, v3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -386,6 +447,26 @@ ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s2, 0x80000000 +; GFX10-NEXT: v_sub_f32_e64 v1, s2, |v1| +; GFX10-NEXT: v_sub_f32_e64 v2, s2, |v2| +; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3| +; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -488,6 +569,24 @@ ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_nnan_inputs_med3_f32_pat0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX10-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -617,6 +716,32 @@ ; GFX9-NEXT: v_max_f32_e32 v1, v2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v2, v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX9-LABEL: v_fmul_v2f16: @@ -18,6 +19,13 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x half> %a, %b ret <2 x half> %mul } @@ -39,6 +47,13 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v2f16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %mul = fmul <2 x half> %neg.a, %b ret <2 x half> %mul @@ -61,6 +76,13 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v2f16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg <2 x half> %b %mul = fmul <2 x half> %a, %neg.b ret <2 x half> %mul @@ -85,6 +107,13 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %neg.b = fneg <2 x half> %b %mul = fmul <2 x half> %neg.a, %neg.b @@ -137,6 +166,14 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v4f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %mul = fmul <4 x half> %a, %b ret <4 x half> %mul } @@ -165,6 +202,14 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v4f16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <4 x half> %a %mul = fmul <4 x half> %neg.a, %b ret <4 x half> %mul @@ -194,6 +239,14 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v4f16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg <4 x half> %b %mul = fmul <4 x half> %a, %neg.b ret <4 x half> %mul @@ -225,6 +278,14 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <4 x half> %a %neg.b = fneg <4 x half> %b %mul = fmul <4 x half> %neg.a, %neg.b @@ -258,6 +319,15 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v6f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v4 +; GFX10-NEXT: v_pk_mul_f16 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %mul = fmul <6 x half> %a, %b ret <6 x half> %mul } @@ -293,6 +363,15 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v6f16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <6 x half> %a %mul = fmul <6 x half> %neg.a, %b ret <6 x half> %mul @@ -329,6 +408,15 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v6f16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg <6 x half> %b %mul = fmul <6 x half> %a, %neg.b ret <6 x half> %mul @@ -368,6 +456,15 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <6 x half> %a %neg.b = fneg <6 x half> %b %mul = fmul <6 x half> %neg.a, %neg.b @@ -406,6 +503,16 @@ ; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v8f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5 +; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 +; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] %mul = fmul <8 x half> %a, %b ret <8 x half> %mul } @@ -447,6 +554,16 @@ ; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v8f16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <8 x half> %a %mul = fmul <8 x half> %neg.a, %b ret <8 x half> %mul @@ -489,6 +606,16 @@ ; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v8f16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg <8 x half> %b %mul = fmul <8 x half> %a, %neg.b ret <8 x half> %mul @@ -535,6 +662,16 @@ ; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <8 x half> %a %neg.b = fneg <8 x half> %b %mul = fmul <8 x half> %neg.a, %neg.b diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define float @v_pow_f32(float %x, float %y) { ; GFX6-LABEL: v_pow_f32: @@ -27,6 +28,15 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -64,6 +74,18 @@ ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f32_e32 v1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_log_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_exp_f32_e32 v1, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y) ret <2 x float> %pow } @@ -101,6 +123,18 @@ ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f16_e32 v0, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_exp_f16_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %pow = call half @llvm.pow.f16(half %x, half %y) ret half %pow } @@ -167,6 +201,25 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f16_e32 v2, v0 +; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) ret <2 x half> %pow } @@ -236,6 +289,26 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_v2f16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_log_f16_e32 v2, v0 +; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) ret <2 x half> %pow @@ -306,6 +379,26 @@ ; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_v2f16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f16_e32 v2, v0 +; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) ret <2 x half> %pow @@ -382,6 +475,28 @@ ; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s4, 0x80008000 +; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_log_f16_e32 v2, v0 +; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_exp_f16_e32 v1, v1 +; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg) @@ -418,6 +533,15 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f32_fabs_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f32_e64 v0, |v0| +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %pow = call float @llvm.pow.f32(float %fabs.x, float %y) ret float %pow @@ -447,6 +571,15 @@ ; GFX9-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f32_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %x, float %fabs.y) ret float %pow @@ -476,6 +609,15 @@ ; GFX9-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f32_fabs_lhs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f32_e64 v0, |v0| +; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %fabs.x, float %fabs.y) @@ -503,6 +645,13 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_pow_f32_sgpr_vgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_log_f32_e32 v1, s0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -528,6 +677,13 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_pow_f32_vgpr_sgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -553,6 +709,13 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_pow_f32_sgpr_sgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_log_f32_e32 v0, s0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -581,6 +744,15 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f32_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f32_e64 v0, -v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %pow = call float @llvm.pow.f32(float %neg.x, float %y) ret float %pow @@ -610,6 +782,15 @@ ; GFX9-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f32_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg float %y %pow = call float @llvm.pow.f32(float %x, float %neg.y) ret float %pow diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v2i16_s_s: @@ -57,6 +58,24 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v2i16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_and_b32 s1, s5, 1 +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_lshl_b32 s1, s1, 4 +; GFX10-NEXT: s_and_b32 s3, s4, s2 +; GFX10-NEXT: s_lshl_b32 s2, s2, s1 +; GFX10-NEXT: s_lshl_b32 s1, s3, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -117,6 +136,23 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v2i16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_and_b32 s0, s3, 1 +; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_lshl_b32 s1, s1, s0 +; GFX10-NEXT: s_lshl_b32 s0, s2, s0 +; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v0, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1 )* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -174,6 +210,22 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v2i16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_and_b32 s1, s4, 1 +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_lshl_b32 s1, s1, 4 +; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: s_lshl_b32 s2, s2, s1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-NEXT: v_lshl_or_b32 v2, v0, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -233,6 +285,23 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v2i16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 +; GFX10-NEXT: s_and_b32 s1, s4, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, s0, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -290,6 +359,22 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v2i16_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, s0, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -349,6 +434,23 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v2i16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s0 +; GFX10-NEXT: s_and_b32 s0, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -407,6 +509,22 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v2i16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_and_b32 s0, s2, 1 +; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v0, s0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -464,6 +582,22 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v2i16_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v0, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -599,6 +733,29 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v4i16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_lshr_b32 s1, s3, 1 +; GFX10-NEXT: s_and_b32 s3, s3, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s3, s3, 4 +; GFX10-NEXT: s_and_b32 s2, s2, s0 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v2, s0, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1 )* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -683,6 +840,31 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v4i16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_lshr_b32 s2, s4, 1 +; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: v_and_b32_e32 v2, s5, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cselect_b32 s3, s1, s0 +; GFX10-NEXT: s_and_b32 s4, s4, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_lshl_b32 s4, s4, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_lshl_b32 s5, s5, s4 +; GFX10-NEXT: s_andn2_b32 s3, s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, s4, s3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -772,6 +954,32 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v4i16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v0 +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_and_b32 s3, s4, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s3 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_and_or_b32 v2, v5, v2, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -859,6 +1067,31 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v4i16_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1 +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 +; GFX10-NEXT: v_and_or_b32 v2, v5, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -936,6 +1169,29 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v4i16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX10-NEXT: s_and_b32 s0, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v3, v5, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -1012,6 +1268,28 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v4i16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_and_b32 s0, s2, 1 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -1087,6 +1365,28 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v4i16_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v4, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v5, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -1202,6 +1502,42 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v8i16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_lshr_b32 s6, s5, 1 +; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cselect_b32 s7, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: s_cselect_b32 s7, s2, s7 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_cselect_b32 s7, s3, s7 +; GFX10-NEXT: s_and_b32 s5, s5, 1 +; GFX10-NEXT: s_and_b32 s4, s4, s8 +; GFX10-NEXT: s_lshl_b32 s5, s5, 4 +; GFX10-NEXT: s_lshl_b32 s8, s8, s5 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10-NEXT: s_andn2_b32 s5, s7, s8 +; GFX10-NEXT: s_or_b32 s4, s5, s4 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_cselect_b32 s0, s4, s0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_cselect_b32 s1, s4, s1 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: s_cselect_b32 s2, s4, s2 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_cselect_b32 s3, s4, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -1300,6 +1636,35 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v8i16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_lshr_b32 s4, s3, 1 +; GFX10-NEXT: s_and_b32 s1, s3, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 +; GFX10-NEXT: s_lshl_b32 s3, s1, 4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 +; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: s_and_b32 s2, s2, s5 +; GFX10-NEXT: s_lshl_b32 s5, s5, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_not_b32 s3, s5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 +; GFX10-NEXT: v_and_or_b32 v4, v4, s3, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1 )* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -1415,6 +1780,41 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v8i16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_lshr_b32 s5, s4, 1 +; GFX10-NEXT: s_mov_b32 s7, 0xffff +; GFX10-NEXT: s_cmp_eq_u32 s5, 1 +; GFX10-NEXT: v_and_b32_e32 v4, s7, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cselect_b32 s6, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_cselect_b32 s6, s2, s6 +; GFX10-NEXT: s_cmp_eq_u32 s5, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_cselect_b32 s6, s3, s6 +; GFX10-NEXT: s_and_b32 s4, s4, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_lshl_b32 s4, s4, 4 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_lshl_b32 s7, s7, s4 +; GFX10-NEXT: s_andn2_b32 s6, s6, s7 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, s4, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -1535,6 +1935,40 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v8i16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: s_and_b32 s1, s4, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: v_and_or_b32 v5, v7, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -1653,6 +2087,39 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v8i16_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_and_or_b32 v5, v7, v6, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -1750,6 +2217,35 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v8i16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v2 +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: s_and_b32 s1, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v11, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v11, v6, s1 +; GFX10-NEXT: v_and_or_b32 v7, v2, v7, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v7, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v7, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -1846,6 +2342,34 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v8i16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX10-NEXT: s_lshr_b32 s3, s2, 1 +; GFX10-NEXT: s_and_b32 s1, s2, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 +; GFX10-NEXT: s_lshl_b32 s2, s1, 4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 +; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_lshl_b32 s2, s4, s2 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 +; GFX10-NEXT: v_and_or_b32 v7, v0, s2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v7, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v7, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -1941,6 +2465,34 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v8i16_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v10, v3, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v7, s1 +; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -2124,6 +2676,65 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v16i16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX10-NEXT: s_lshr_b32 s7, s5, 1 +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cselect_b32 s0, s9, s8 +; GFX10-NEXT: s_cmp_eq_u32 s7, 2 +; GFX10-NEXT: s_cselect_b32 s0, s10, s0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 3 +; GFX10-NEXT: s_cselect_b32 s0, s11, s0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 4 +; GFX10-NEXT: s_cselect_b32 s0, s12, s0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 5 +; GFX10-NEXT: s_cselect_b32 s0, s13, s0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 6 +; GFX10-NEXT: s_cselect_b32 s0, s14, s0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 7 +; GFX10-NEXT: s_cselect_b32 s0, s15, s0 +; GFX10-NEXT: s_and_b32 s1, s5, 1 +; GFX10-NEXT: s_and_b32 s3, s4, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 4 +; GFX10-NEXT: s_lshl_b32 s2, s2, s1 +; GFX10-NEXT: s_lshl_b32 s1, s3, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s16, s0, s1 +; GFX10-NEXT: s_cmp_eq_u32 s7, 0 +; GFX10-NEXT: s_cselect_b32 s0, s16, s8 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 +; GFX10-NEXT: s_cselect_b32 s1, s16, s9 +; GFX10-NEXT: s_cmp_eq_u32 s7, 2 +; GFX10-NEXT: s_cselect_b32 s2, s16, s10 +; GFX10-NEXT: s_cmp_eq_u32 s7, 3 +; GFX10-NEXT: s_cselect_b32 s3, s16, s11 +; GFX10-NEXT: s_cmp_eq_u32 s7, 4 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_cselect_b32 s4, s16, s12 +; GFX10-NEXT: s_cmp_eq_u32 s7, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_cselect_b32 s5, s16, s13 +; GFX10-NEXT: s_cmp_eq_u32 s7, 6 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_cselect_b32 s6, s16, s14 +; GFX10-NEXT: s_cmp_eq_u32 s7, 7 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_cselect_b32 s7, s16, s15 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s7 +; GFX10-NEXT: s_mov_b64 s[0:1], 16 +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -2274,6 +2885,53 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v16i16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX10-NEXT: s_lshr_b32 s7, s3, 1 +; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s7, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, s7, 4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s7, 5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s7, 6 +; GFX10-NEXT: s_and_b32 s9, s2, s8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s7, 7 +; GFX10-NEXT: s_and_b32 s3, s3, 1 +; GFX10-NEXT: v_mov_b32_e32 v13, 0 +; GFX10-NEXT: s_lshl_b32 s3, s3, 4 +; GFX10-NEXT: s_lshl_b32 s8, s8, s3 +; GFX10-NEXT: s_lshl_b32 s3, s9, s3 +; GFX10-NEXT: s_not_b32 s8, s8 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cndmask_b32_e32 v11, v2, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v0, v7, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v8, s6 +; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2 +; GFX10-NEXT: v_and_or_b32 v12, v0, s8, s3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s7, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v12, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v12, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v12, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v7, v12, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v12, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v12, s2 +; GFX10-NEXT: s_mov_b64 s[0:1], 16 +; GFX10-NEXT: global_store_dwordx4 v[10:11], v[0:3], off +; GFX10-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1] +; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1 )* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -2458,6 +3116,64 @@ ; GFX7-NEXT: s_mov_b64 s[0:1], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v16i16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX10-NEXT: s_lshr_b32 s0, s4, 1 +; GFX10-NEXT: s_mov_b32 s3, 0xffff +; GFX10-NEXT: s_cmp_eq_u32 s0, 1 +; GFX10-NEXT: v_and_b32_e32 v8, s3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cselect_b32 s1, s9, s8 +; GFX10-NEXT: s_cmp_eq_u32 s0, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_cselect_b32 s1, s10, s1 +; GFX10-NEXT: s_cmp_eq_u32 s0, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: s_cselect_b32 s1, s11, s1 +; GFX10-NEXT: s_cmp_eq_u32 s0, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: s_cselect_b32 s1, s12, s1 +; GFX10-NEXT: s_cmp_eq_u32 s0, 5 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: s_cselect_b32 s1, s13, s1 +; GFX10-NEXT: s_cmp_eq_u32 s0, 6 +; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: s_cselect_b32 s1, s14, s1 +; GFX10-NEXT: s_cmp_eq_u32 s0, 7 +; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: s_cselect_b32 s1, s15, s1 +; GFX10-NEXT: s_and_b32 s2, s4, 1 +; GFX10-NEXT: v_mov_b32_e32 v6, s14 +; GFX10-NEXT: s_lshl_b32 s2, s2, 4 +; GFX10-NEXT: v_mov_b32_e32 v7, s15 +; GFX10-NEXT: s_lshl_b32 s3, s3, s2 +; GFX10-NEXT: s_andn2_b32 s1, s1, s3 +; GFX10-NEXT: v_lshl_or_b32 v10, v8, s2, s1 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 7 +; GFX10-NEXT: s_mov_b64 s[0:1], 16 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1] +; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -2647,6 +3363,59 @@ ; GFX7-NEXT: s_mov_b64 s[0:1], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v16i16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: s_and_b32 s6, s4, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10 +; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v1, s15, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: v_mov_b32_e32 v6, s14 +; GFX10-NEXT: v_mov_b32_e32 v7, s15 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v11, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s5 +; GFX10-NEXT: s_mov_b64 s[0:1], 16 +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] +; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -2834,6 +3603,58 @@ ; GFX7-NEXT: s_mov_b64 s[0:1], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v16i16_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v2, s15, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8 +; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: v_mov_b32_e32 v6, s14 +; GFX10-NEXT: v_mov_b32_e32 v7, s15 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v11, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s5 +; GFX10-NEXT: s_mov_b64 s[0:1], 16 +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] +; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -2983,6 +3804,53 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v16i16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v14, 0 +; GFX10-NEXT: s_and_b32 s6, s2, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v5, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v1, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 +; GFX10-NEXT: v_and_or_b32 v13, v1, v11, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v13, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v13, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v13, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v13, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v13, s5 +; GFX10-NEXT: s_mov_b64 s[0:1], 16 +; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off +; GFX10-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] +; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -3131,6 +3999,52 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v16i16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX10-NEXT: s_lshr_b32 s6, s2, 1 +; GFX10-NEXT: s_and_b32 s5, s2, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s6, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s6, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s6, 4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, s6, 5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s6, 6 +; GFX10-NEXT: s_lshl_b32 s7, s5, 4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s6, 7 +; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_lshl_b32 s7, s8, s7 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s6, 0 +; GFX10-NEXT: s_not_b32 s7, s7 +; GFX10-NEXT: v_mov_b32_e32 v14, 0 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e64 v11, v0, v7, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v8, s4 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s5 +; GFX10-NEXT: v_and_or_b32 v13, v0, s7, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v13, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v13, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v13, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v13, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v13, s5 +; GFX10-NEXT: s_mov_b64 s[0:1], 16 +; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off +; GFX10-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] +; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -3278,6 +4192,52 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v16i16_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_mov_b32_e32 v13, 0 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cndmask_b32_e64 v15, v1, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v9, s3 +; GFX10-NEXT: v_mov_b32_e32 v15, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 +; GFX10-NEXT: v_and_or_b32 v14, v1, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v14, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v14, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v14, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v14, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v14, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v14, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v14, s5 +; GFX10-NEXT: s_mov_b64 s[0:1], 16 +; GFX10-NEXT: global_store_dwordx4 v[12:13], v[0:3], off +; GFX10-NEXT: global_store_dwordx4 v15, v[4:7], s[0:1] +; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v2i8_s_s: @@ -65,6 +66,25 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v2i8_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_and_b32_sdwa v1, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -129,6 +149,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v2i8_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 0 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1 )* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -194,6 +232,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v2i8_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_and_b32_sdwa v1, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -262,6 +318,25 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v2i8_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: global_load_ushort v2, v2, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX10-NEXT: v_and_b32_sdwa v1, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -327,6 +402,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v2i8_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: global_load_ushort v2, v2, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -391,6 +484,24 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v2i8_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -452,6 +563,23 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v2i8_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -513,6 +641,23 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v2i8_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -691,6 +836,39 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v4i8_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_and_b32 s1, s3, 3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: s_and_b32 s2, s2, s0 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX10-NEXT: s_lshl_b32 s3, s0, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 +; GFX10-NEXT: s_not_b32 s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_or3_b32 v0, v0, v3, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, s1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v2, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX10-NEXT: v_or3_b32 v2, v0, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1 )* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -815,6 +993,42 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v4i8_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: s_and_b32 s1, s4, 3 +; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: s_lshl_b32 s1, s1, 3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_and_b32 s3, s3, s2 +; GFX10-NEXT: s_and_b32 s4, s4, s2 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s3, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_lshl_b32 s4, s2, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_andn2_b32 s0, s0, s4 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, s1, s0 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v1 +; GFX10-NEXT: v_or3_b32 v2, v0, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -941,6 +1155,43 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v4i8_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_and_b32 s2, s4, s1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s3, s3, s1 +; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_and_b32 s0, s0, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s2, s4, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: v_and_or_b32 v0, s0, v1, v0 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX10-NEXT: v_or3_b32 v2, v0, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1065,6 +1316,42 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v4i8_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s3, s3, s1 +; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_and_b32 s0, s0, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s2, s4, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: v_and_or_b32 v0, s0, v1, v0 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX10-NEXT: v_or3_b32 v2, v0, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1185,6 +1472,39 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v4i8_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: s_and_b32 s0, s2, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v4 +; GFX10-NEXT: v_or3_b32 v0, v0, v5, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v2, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX10-NEXT: v_or3_b32 v2, v0, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1304,6 +1624,38 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v4i8_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_and_b32 s1, s2, 3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_and_b32_sdwa v4, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v6, v0, s0, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_lshl_b32 s1, s0, s1 +; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: v_or3_b32 v0, v6, v4, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v2, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX10-NEXT: v_or3_b32 v2, v0, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1425,6 +1777,39 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v4i8_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v1, 3, v3 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_and_or_b32 v3, v0, s1, v3 +; GFX10-NEXT: v_or3_b32 v0, v3, v6, v4 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xff +; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_and_or_b32 v3, v0, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_or3_b32 v2, v3, v2, v4 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1654,6 +2039,80 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v8i8_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: s_lshr_b32 s3, s5, 2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_lshr_b32 s10, s1, 16 +; GFX10-NEXT: s_and_b32 s6, s6, s2 +; GFX10-NEXT: s_and_b32 s9, s9, s2 +; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: s_lshr_b32 s11, s1, 24 +; GFX10-NEXT: s_and_b32 s7, s7, s2 +; GFX10-NEXT: s_and_b32 s10, s10, s2 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s10, s10, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s7 +; GFX10-NEXT: s_lshl_b32 s11, s11, 24 +; GFX10-NEXT: s_or_b32 s1, s1, s10 +; GFX10-NEXT: s_or_b32 s0, s0, s8 +; GFX10-NEXT: s_or_b32 s1, s1, s11 +; GFX10-NEXT: s_cmp_eq_u32 s3, 1 +; GFX10-NEXT: s_cselect_b32 s6, s1, s0 +; GFX10-NEXT: s_and_b32 s5, s5, 3 +; GFX10-NEXT: s_and_b32 s4, s4, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s7, s2, s5 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10-NEXT: s_andn2_b32 s5, s6, s7 +; GFX10-NEXT: s_or_b32 s4, s5, s4 +; GFX10-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10-NEXT: s_cselect_b32 s0, s4, s0 +; GFX10-NEXT: s_cmp_eq_u32 s3, 1 +; GFX10-NEXT: s_cselect_b32 s1, s4, s1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_and_b32 s3, s3, s2 +; GFX10-NEXT: s_and_b32 s4, s4, s2 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_and_b32 s4, s6, s2 +; GFX10-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s2, s7, s2 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s4 +; GFX10-NEXT: s_lshl_b32 s4, s5, 24 +; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s2, s3, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -1847,6 +2306,59 @@ ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v8i8_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v11, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v2, v1, s1, v3 +; GFX10-NEXT: s_lshr_b32 s0, s3, 2 +; GFX10-NEXT: s_and_b32 s3, s3, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: s_lshl_b32 s3, s3, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 +; GFX10-NEXT: s_lshl_b32 s4, s1, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo +; GFX10-NEXT: s_not_b32 s3, s4 +; GFX10-NEXT: v_and_or_b32 v2, v2, s3, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v7, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v4, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX10-NEXT: v_or3_b32 v1, v1, v4, v6 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1 )* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -2061,6 +2573,70 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v8i8_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: s_lshr_b32 s3, s4, 2 +; GFX10-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s5, s0, 8 +; GFX10-NEXT: s_lshr_b32 s8, s1, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshr_b32 s9, s1, 16 +; GFX10-NEXT: s_and_b32 s5, s5, s2 +; GFX10-NEXT: s_and_b32 s8, s8, s2 +; GFX10-NEXT: s_lshr_b32 s7, s0, 24 +; GFX10-NEXT: s_lshr_b32 s10, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s6, s2 +; GFX10-NEXT: s_and_b32 s9, s9, s2 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s9, s9, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s8 +; GFX10-NEXT: s_lshl_b32 s7, s7, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s10, s10, 24 +; GFX10-NEXT: s_or_b32 s1, s1, s9 +; GFX10-NEXT: s_or_b32 s0, s0, s7 +; GFX10-NEXT: s_or_b32 s1, s1, s10 +; GFX10-NEXT: s_cmp_eq_u32 s3, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_cselect_b32 s5, s1, s0 +; GFX10-NEXT: s_and_b32 s4, s4, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_lshl_b32 s6, s2, s4 +; GFX10-NEXT: s_andn2_b32 s5, s5, s6 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, s4, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2 +; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3 +; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -2280,6 +2856,71 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v8i8_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 3, v0 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: s_and_b32 s3, s4, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s3 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v1, s2 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_and_b32 s6, s6, s2 +; GFX10-NEXT: s_lshr_b32 s8, s1, 24 +; GFX10-NEXT: s_and_b32 s7, s7, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_and_b32 s3, s3, s2 +; GFX10-NEXT: s_lshl_b32 s8, s8, 24 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s4, s2 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s8 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_and_or_b32 v3, v5, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2 +; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3 +; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -2497,6 +3138,70 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v8i8_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: v_and_b32_e32 v2, 3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v2, s2 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_and_b32 s6, s6, s2 +; GFX10-NEXT: s_lshr_b32 s8, s1, 24 +; GFX10-NEXT: s_and_b32 s7, s7, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_and_b32 s3, s3, s2 +; GFX10-NEXT: s_lshl_b32 s8, s8, 24 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s4, s2 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s8 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_and_or_b32 v2, v5, v2, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2 +; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3 +; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -2689,6 +3394,59 @@ ; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v8i8_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[11:12], v[0:1], off +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v2 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v12 +; GFX10-NEXT: v_and_b32_sdwa v8, v11, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v9, v12, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v11, v11, s1, v4 +; GFX10-NEXT: v_and_or_b32 v10, v12, s1, v5 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, s1 +; GFX10-NEXT: s_and_b32 s0, s2, s1 +; GFX10-NEXT: v_or3_b32 v0, v11, v8, v6 +; GFX10-NEXT: v_or3_b32 v1, v10, v9, v7 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v3, v5, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v7, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v4, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX10-NEXT: v_or3_b32 v1, v1, v4, v6 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -2880,6 +3638,58 @@ ; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v8i8_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v7, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v8, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v4 +; GFX10-NEXT: s_and_b32 s0, s2, 3 +; GFX10-NEXT: s_lshr_b32 s2, s2, 2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX10-NEXT: v_or3_b32 v1, v1, v8, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v7, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v4, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX10-NEXT: v_or3_b32 v1, v1, v4, v6 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -3073,6 +3883,59 @@ ; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v8i8_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v3 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; GFX10-NEXT: v_mov_b32_e32 v5, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v10, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v11, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_and_or_b32 v15, v0, s1, v6 +; GFX10-NEXT: v_and_or_b32 v14, v1, s1, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, v4, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_or3_b32 v0, v15, v10, v8 +; GFX10-NEXT: v_or3_b32 v1, v14, v11, v9 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v7, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX10-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_b32_sdwa v4, v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v3, v0, v5, v3 +; GFX10-NEXT: v_and_or_b32 v1, v11, v5, v2 +; GFX10-NEXT: v_or3_b32 v0, v3, v8, v6 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_or3_b32 v1, v1, v4, v7 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -3476,6 +4339,138 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v16i8_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s6, 0xff +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s7, s0, 8 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_and_b32 s7, s7, s6 +; GFX10-NEXT: s_lshr_b32 s9, s0, 24 +; GFX10-NEXT: s_and_b32 s8, s8, s6 +; GFX10-NEXT: s_and_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_lshr_b32 s13, s2, 8 +; GFX10-NEXT: s_lshl_b32 s8, s8, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s7 +; GFX10-NEXT: s_lshr_b32 s10, s1, 8 +; GFX10-NEXT: s_or_b32 s0, s0, s8 +; GFX10-NEXT: s_and_b32 s8, s13, s6 +; GFX10-NEXT: s_lshl_b32 s9, s9, 24 +; GFX10-NEXT: s_lshr_b32 s14, s2, 16 +; GFX10-NEXT: s_lshr_b32 s11, s1, 16 +; GFX10-NEXT: s_and_b32 s10, s10, s6 +; GFX10-NEXT: s_lshr_b32 s15, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s9 +; GFX10-NEXT: s_and_b32 s2, s2, s6 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_and_b32 s9, s14, s6 +; GFX10-NEXT: s_lshr_b32 s12, s1, 24 +; GFX10-NEXT: s_and_b32 s11, s11, s6 +; GFX10-NEXT: s_or_b32 s2, s2, s8 +; GFX10-NEXT: s_lshl_b32 s8, s9, 16 +; GFX10-NEXT: s_lshr_b32 s16, s3, 8 +; GFX10-NEXT: s_and_b32 s1, s1, s6 +; GFX10-NEXT: s_lshl_b32 s10, s10, 8 +; GFX10-NEXT: s_lshr_b32 s17, s3, 16 +; GFX10-NEXT: s_or_b32 s2, s2, s8 +; GFX10-NEXT: s_and_b32 s8, s16, s6 +; GFX10-NEXT: s_lshl_b32 s7, s11, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s10 +; GFX10-NEXT: s_lshr_b32 s18, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_lshl_b32 s7, s12, 24 +; GFX10-NEXT: s_and_b32 s3, s3, s6 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_and_b32 s9, s17, s6 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_lshl_b32 s7, s15, 24 +; GFX10-NEXT: s_or_b32 s3, s3, s8 +; GFX10-NEXT: s_lshl_b32 s8, s9, 16 +; GFX10-NEXT: s_or_b32 s2, s2, s7 +; GFX10-NEXT: s_or_b32 s3, s3, s8 +; GFX10-NEXT: s_lshl_b32 s7, s18, 24 +; GFX10-NEXT: s_lshr_b32 s8, s5, 2 +; GFX10-NEXT: s_or_b32 s3, s3, s7 +; GFX10-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10-NEXT: s_cselect_b32 s7, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10-NEXT: s_cselect_b32 s7, s2, s7 +; GFX10-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10-NEXT: s_cselect_b32 s7, s3, s7 +; GFX10-NEXT: s_and_b32 s5, s5, 3 +; GFX10-NEXT: s_and_b32 s4, s4, s6 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s9, s6, s5 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10-NEXT: s_andn2_b32 s5, s7, s9 +; GFX10-NEXT: s_or_b32 s4, s5, s4 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_cselect_b32 s0, s4, s0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10-NEXT: s_cselect_b32 s1, s4, s1 +; GFX10-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10-NEXT: s_cselect_b32 s2, s4, s2 +; GFX10-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10-NEXT: s_cselect_b32 s3, s4, s3 +; GFX10-NEXT: s_lshr_b32 s4, s0, 8 +; GFX10-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-NEXT: s_and_b32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s7, s0, 24 +; GFX10-NEXT: s_and_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_and_b32 s5, s5, s6 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_lshl_b32 s4, s5, 16 +; GFX10-NEXT: s_lshr_b32 s8, s1, 8 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_lshl_b32 s4, s7, 24 +; GFX10-NEXT: s_and_b32 s7, s8, s6 +; GFX10-NEXT: s_lshr_b32 s9, s1, 16 +; GFX10-NEXT: s_lshr_b32 s10, s1, 24 +; GFX10-NEXT: s_and_b32 s1, s1, s6 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_and_b32 s8, s9, s6 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_lshl_b32 s7, s8, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_lshr_b32 s11, s2, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_lshl_b32 s4, s10, 24 +; GFX10-NEXT: s_lshr_b32 s14, s3, 8 +; GFX10-NEXT: s_and_b32 s7, s11, s6 +; GFX10-NEXT: s_lshr_b32 s12, s2, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s4 +; GFX10-NEXT: s_and_b32 s4, s14, s6 +; GFX10-NEXT: s_lshr_b32 s15, s3, 16 +; GFX10-NEXT: s_lshr_b32 s13, s2, 24 +; GFX10-NEXT: s_lshr_b32 s5, s3, 24 +; GFX10-NEXT: s_and_b32 s2, s2, s6 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_and_b32 s8, s12, s6 +; GFX10-NEXT: s_and_b32 s3, s3, s6 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_and_b32 s6, s15, s6 +; GFX10-NEXT: s_or_b32 s2, s2, s7 +; GFX10-NEXT: s_lshl_b32 s7, s8, 16 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_lshl_b32 s4, s6, 16 +; GFX10-NEXT: s_or_b32 s2, s2, s7 +; GFX10-NEXT: s_lshl_b32 s6, s13, 24 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_lshl_b32 s4, s5, 24 +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -3795,6 +4790,93 @@ ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v16i8_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v4, 8 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: s_lshr_b32 s5, s3, 2 +; GFX10-NEXT: s_and_b32 s1, s3, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 +; GFX10-NEXT: s_lshl_b32 s3, s1, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 3 +; GFX10-NEXT: s_and_b32 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s6, s4, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_not_b32 s3, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v13, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v14, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v13, v6 +; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or3_b32 v7, v1, v14, v8 +; GFX10-NEXT: v_and_b32_sdwa v16, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v12 +; GFX10-NEXT: v_or3_b32 v2, v2, v15, v5 +; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 2 +; GFX10-NEXT: v_or3_b32 v3, v3, v16, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s1 +; GFX10-NEXT: v_and_or_b32 v5, v5, s3, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s5, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX10-NEXT: v_and_b32_sdwa v13, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_and_or_b32 v5, v2, s4, v9 +; GFX10-NEXT: v_and_b32_sdwa v14, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_b32_sdwa v16, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX10-NEXT: v_and_or_b32 v18, v3, s4, v4 +; GFX10-NEXT: v_or3_b32 v2, v5, v15, v10 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v13, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v14, v8 +; GFX10-NEXT: v_or3_b32 v3, v18, v16, v11 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1 )* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -4167,6 +5249,119 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v16i8_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: v_mov_b32_e32 v9, 8 +; GFX10-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s6, s6, s5 +; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: s_and_b32 s7, s7, s5 +; GFX10-NEXT: s_and_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_lshr_b32 s12, s2, 8 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; GFX10-NEXT: s_or_b32 s0, s0, s7 +; GFX10-NEXT: s_and_b32 s7, s12, s5 +; GFX10-NEXT: s_lshl_b32 s8, s8, 24 +; GFX10-NEXT: s_lshr_b32 s13, s2, 16 +; GFX10-NEXT: s_lshr_b32 s10, s1, 16 +; GFX10-NEXT: s_and_b32 s9, s9, s5 +; GFX10-NEXT: s_lshr_b32 s14, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s8 +; GFX10-NEXT: s_and_b32 s2, s2, s5 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_and_b32 s8, s13, s5 +; GFX10-NEXT: s_lshr_b32 s11, s1, 24 +; GFX10-NEXT: s_and_b32 s10, s10, s5 +; GFX10-NEXT: s_or_b32 s2, s2, s7 +; GFX10-NEXT: s_lshl_b32 s7, s8, 16 +; GFX10-NEXT: s_lshr_b32 s15, s3, 8 +; GFX10-NEXT: s_and_b32 s1, s1, s5 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_lshr_b32 s16, s3, 16 +; GFX10-NEXT: s_or_b32 s2, s2, s7 +; GFX10-NEXT: s_and_b32 s7, s15, s5 +; GFX10-NEXT: s_lshl_b32 s6, s10, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s9 +; GFX10-NEXT: s_lshr_b32 s17, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_lshl_b32 s6, s11, 24 +; GFX10-NEXT: s_and_b32 s3, s3, s5 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_and_b32 s8, s16, s5 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_lshl_b32 s6, s14, 24 +; GFX10-NEXT: s_or_b32 s3, s3, s7 +; GFX10-NEXT: s_lshl_b32 s7, s8, 16 +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_or_b32 s3, s3, s7 +; GFX10-NEXT: s_lshl_b32 s6, s17, 24 +; GFX10-NEXT: s_lshr_b32 s7, s4, 2 +; GFX10-NEXT: s_or_b32 s3, s3, s6 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 0 +; GFX10-NEXT: s_cselect_b32 s6, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 2 +; GFX10-NEXT: s_cselect_b32 s6, s2, s6 +; GFX10-NEXT: s_cmp_eq_u32 s7, 3 +; GFX10-NEXT: s_cselect_b32 s6, s3, s6 +; GFX10-NEXT: s_and_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_lshl_b32 s8, s5, s4 +; GFX10-NEXT: s_andn2_b32 s6, s6, s8 +; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 2 +; GFX10-NEXT: v_and_b32_sdwa v0, v15, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 3 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX10-NEXT: v_and_or_b32 v6, v1, s5, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_or_b32 v19, v15, s5, v4 +; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9 +; GFX10-NEXT: v_and_b32_sdwa v14, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7 +; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10 +; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -4544,6 +5739,118 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v16i8_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, 8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; GFX10-NEXT: s_and_b32 s6, s6, s5 +; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: s_lshr_b32 s10, s1, 16 +; GFX10-NEXT: s_and_b32 s7, s7, s5 +; GFX10-NEXT: s_and_b32 s9, s9, s5 +; GFX10-NEXT: s_and_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_lshr_b32 s11, s1, 24 +; GFX10-NEXT: s_and_b32 s1, s1, s5 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_and_b32 s10, s10, s5 +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_lshr_b32 s12, s2, 8 +; GFX10-NEXT: s_lshl_b32 s8, s8, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s7 +; GFX10-NEXT: s_lshl_b32 s6, s10, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s9 +; GFX10-NEXT: s_lshr_b32 s13, s2, 16 +; GFX10-NEXT: s_or_b32 s8, s0, s8 +; GFX10-NEXT: s_or_b32 s0, s1, s6 +; GFX10-NEXT: s_and_b32 s6, s12, s5 +; GFX10-NEXT: s_lshr_b32 s14, s2, 24 +; GFX10-NEXT: s_and_b32 s2, s2, s5 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_and_b32 s7, s13, s5 +; GFX10-NEXT: s_lshl_b32 s1, s11, 24 +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_lshr_b32 s15, s3, 8 +; GFX10-NEXT: s_lshl_b32 s6, s7, 16 +; GFX10-NEXT: s_or_b32 s9, s0, s1 +; GFX10-NEXT: s_or_b32 s0, s2, s6 +; GFX10-NEXT: s_and_b32 s2, s15, s5 +; GFX10-NEXT: s_lshl_b32 s1, s14, 24 +; GFX10-NEXT: s_lshr_b32 s16, s3, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: s_lshr_b32 s17, s3, 24 +; GFX10-NEXT: s_or_b32 s10, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s16, s5 +; GFX10-NEXT: s_and_b32 s3, s3, s5 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s0, s3, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX10-NEXT: s_or_b32 s1, s0, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX10-NEXT: s_lshl_b32 s2, s17, 24 +; GFX10-NEXT: s_or_b32 s11, s1, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 +; GFX10-NEXT: s_and_b32 s2, s4, s5 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 +; GFX10-NEXT: v_and_or_b32 v5, v1, v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 +; GFX10-NEXT: s_mov_b32 s2, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX10-NEXT: v_and_or_b32 v19, v15, s5, v4 +; GFX10-NEXT: v_and_b32_sdwa v0, v15, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v6, v1, s5, v6 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9 +; GFX10-NEXT: v_and_b32_sdwa v14, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7 +; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10 +; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -4919,6 +6226,117 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_s_v16i8_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_movk_i32 s8, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, 8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s4, s0, 8 +; GFX10-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-NEXT: s_lshr_b32 s7, s1, 8 +; GFX10-NEXT: s_and_b32 s4, s4, s8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 24 +; GFX10-NEXT: s_lshr_b32 s9, s1, 16 +; GFX10-NEXT: s_and_b32 s5, s5, s8 +; GFX10-NEXT: s_and_b32 s7, s7, s8 +; GFX10-NEXT: s_and_b32 s0, s0, s8 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_and_b32 s9, s9, s8 +; GFX10-NEXT: s_lshr_b32 s10, s1, 24 +; GFX10-NEXT: s_and_b32 s1, s1, s8 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_lshr_b32 s11, s2, 8 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s9, s9, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_lshr_b32 s12, s2, 16 +; GFX10-NEXT: s_or_b32 s4, s0, s6 +; GFX10-NEXT: s_or_b32 s0, s1, s9 +; GFX10-NEXT: s_and_b32 s1, s11, s8 +; GFX10-NEXT: s_lshr_b32 s13, s2, 24 +; GFX10-NEXT: s_and_b32 s2, s2, s8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_and_b32 s5, s12, s8 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_lshl_b32 s2, s5, 16 +; GFX10-NEXT: s_lshl_b32 s5, s10, 24 +; GFX10-NEXT: s_lshr_b32 s14, s3, 8 +; GFX10-NEXT: s_or_b32 s5, s0, s5 +; GFX10-NEXT: s_and_b32 s0, s14, s8 +; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s2, s13, 24 +; GFX10-NEXT: s_lshr_b32 s15, s3, 16 +; GFX10-NEXT: s_or_b32 s6, s1, s2 +; GFX10-NEXT: s_and_b32 s1, s3, s8 +; GFX10-NEXT: s_lshl_b32 s0, s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_and_b32 s1, s15, s8 +; GFX10-NEXT: s_lshr_b32 s16, s3, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: v_cndmask_b32_e32 v2, s4, v2, vcc_lo +; GFX10-NEXT: s_or_b32 s1, s0, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX10-NEXT: s_lshl_b32 s2, s16, 24 +; GFX10-NEXT: s_or_b32 s7, s1, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, s1 +; GFX10-NEXT: v_and_or_b32 v5, v2, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 +; GFX10-NEXT: s_mov_b32 s2, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX10-NEXT: v_and_or_b32 v19, v15, s8, v4 +; GFX10-NEXT: v_and_b32_sdwa v0, v15, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v6, v1, s8, v6 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v8, v2, s8, v8 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_and_or_b32 v9, v3, s8, v9 +; GFX10-NEXT: v_and_b32_sdwa v14, v3, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7 +; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10 +; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -5237,6 +6655,93 @@ ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v16i8_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v22, 8 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_movk_i32 s3, 0xff +; GFX10-NEXT: v_and_b32_e32 v0, 3, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX10-NEXT: s_and_b32 s1, s2, s3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v26, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_or_b32 v19, v4, s3, v9 +; GFX10-NEXT: v_and_b32_sdwa v15, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v16, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v26, v3, s3, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; GFX10-NEXT: v_and_b32_sdwa v17, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v12 +; GFX10-NEXT: v_and_or_b32 v30, v5, s3, v11 +; GFX10-NEXT: v_or3_b32 v3, v26, v15, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or3_b32 v26, v19, v16, v10 +; GFX10-NEXT: v_and_b32_sdwa v18, v6, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_or3_b32 v5, v30, v17, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v14 +; GFX10-NEXT: v_and_or_b32 v11, v6, s3, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v26, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v9, v0, s3 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX10-NEXT: v_or3_b32 v6, v11, v18, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v5, s0 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v6, s1 +; GFX10-NEXT: v_and_or_b32 v0, v7, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v18, v26, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v0, s1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v22, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v19, v2, s3, v5 +; GFX10-NEXT: v_and_b32_sdwa v14, v18, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v16, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX10-NEXT: v_and_or_b32 v3, v18, s3, v7 +; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v1 +; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9 +; GFX10-NEXT: v_and_b32_sdwa v13, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8 +; GFX10-NEXT: v_or3_b32 v3, v5, v16, v11 +; GFX10-NEXT: v_or3_b32 v2, v4, v15, v10 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v0, v19, v13, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -5554,6 +7059,92 @@ ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v16i8_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v18, 8 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_movk_i32 s3, 0xff +; GFX10-NEXT: s_lshr_b32 s4, s2, 2 +; GFX10-NEXT: s_and_b32 s1, s2, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX10-NEXT: s_lshl_b32 s2, s1, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v27, 8, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 +; GFX10-NEXT: v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v22, v4, s3, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v1, v3, s3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v23, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v14, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or3_b32 v3, v22, v15, v9 +; GFX10-NEXT: v_and_b32_sdwa v17, v6, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v13 +; GFX10-NEXT: v_or3_b32 v4, v5, v23, v4 +; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 +; GFX10-NEXT: v_or3_b32 v7, v6, v17, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s1 +; GFX10-NEXT: v_and_or_b32 v2, v5, s2, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v1, v2, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v22, v3, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v2, s1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v19 +; GFX10-NEXT: v_and_b32_sdwa v13, v19, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v19, v19, s3, v5 +; GFX10-NEXT: v_and_b32_sdwa v14, v22, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v16, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v3, v22, s3, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9 +; GFX10-NEXT: v_and_or_b32 v5, v2, s3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8 +; GFX10-NEXT: v_or3_b32 v2, v4, v15, v10 +; GFX10-NEXT: v_or3_b32 v3, v5, v16, v11 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v0, v19, v13, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -5873,6 +7464,93 @@ ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: insertelement_v_v16i8_v_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v8, 8 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: v_and_b32_e32 v0, 3, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v17, v4, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v18, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; GFX10-NEXT: v_and_or_b32 v4, v4, s1, v9 +; GFX10-NEXT: v_and_or_b32 v5, v5, s1, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, 8, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v19, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v14 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v16, 24, v7 +; GFX10-NEXT: v_and_or_b32 v6, v6, s1, v13 +; GFX10-NEXT: v_or3_b32 v15, v4, v17, v10 +; GFX10-NEXT: v_or3_b32 v5, v5, v18, v12 +; GFX10-NEXT: v_and_b32_sdwa v20, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v16 +; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v14 +; GFX10-NEXT: v_or3_b32 v6, v6, v19, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v15, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, v0, v1 +; GFX10-NEXT: v_or3_b32 v7, v7, v20, v10 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v6, s0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v7, s1 +; GFX10-NEXT: v_and_or_b32 v0, v9, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v7, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v15, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v18, v5, v0, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v19, 8, v27 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v27 +; GFX10-NEXT: v_lshlrev_b32_sdwa v23, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v21, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v15, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v19, v2, v1, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_b32_sdwa v16, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v10, v4, v1, v10 +; GFX10-NEXT: v_and_b32_sdwa v17, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX10-NEXT: v_and_or_b32 v3, v27, v1, v8 +; GFX10-NEXT: v_and_or_b32 v2, v18, v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v0, v19, v21, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_or3_b32 v3, v3, v17, v12 +; GFX10-NEXT: v_or3_b32 v1, v2, v15, v9 +; GFX10-NEXT: v_or3_b32 v2, v10, v16, v11 +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, <64 x i32> addrspace(1)* %ptr.out) #0 { ; GCN-LABEL: v_insert_v64i32_37: @@ -61,6 +62,68 @@ ; GCN-NEXT: global_store_dwordx4 v68, v[36:39], s[2:3] offset:96 ; GCN-NEXT: global_store_dwordx4 v68, v[40:43], s[2:3] offset:112 ; GCN-NEXT: s_endpgm +; +; GFX10-LABEL: v_insert_v64i32_37: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v70, 8, v0 +; GFX10-NEXT: s_movk_i32 s4, 0x80 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_movk_i32 s4, 0xc0 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: s_clause 0x4 +; GFX10-NEXT: global_load_dwordx4 v[32:35], v70, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[36:39], v70, s[0:1] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[40:43], v70, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[44:47], v70, s[0:1] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[48:51], v70, s[0:1] offset:64 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v5, v70 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v64, vcc_lo, v0, 64 +; GFX10-NEXT: v_add_co_ci_u32_e32 v65, vcc_lo, 0, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v66, vcc_lo, v0, v1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v67, vcc_lo, v5, v2, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v68, vcc_lo, v0, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v69, vcc_lo, v5, v4, vcc_lo +; GFX10-NEXT: s_clause 0xa +; GFX10-NEXT: global_load_dwordx4 v[52:55], v[64:65], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[56:59], v[64:65], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[60:63], v[64:65], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[66:67], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[66:67], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[66:67], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v[68:69], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v[68:69], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v[68:69], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v70, s[0:1] offset:128 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v70, s[0:1] offset:192 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_mov_b32_e32 v5, 0x3e7 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: global_store_dwordx4 v70, v[0:3], s[2:3] offset:128 +; GFX10-NEXT: global_store_dwordx4 v70, v[4:7], s[2:3] offset:144 +; GFX10-NEXT: global_store_dwordx4 v70, v[8:11], s[2:3] offset:160 +; GFX10-NEXT: global_store_dwordx4 v70, v[12:15], s[2:3] offset:176 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v70, v[16:19], s[2:3] offset:192 +; GFX10-NEXT: global_store_dwordx4 v70, v[20:23], s[2:3] offset:208 +; GFX10-NEXT: global_store_dwordx4 v70, v[24:27], s[2:3] offset:224 +; GFX10-NEXT: global_store_dwordx4 v70, v[32:35], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v70, v[36:39], s[2:3] offset:16 +; GFX10-NEXT: global_store_dwordx4 v70, v[40:43], s[2:3] offset:32 +; GFX10-NEXT: global_store_dwordx4 v70, v[44:47], s[2:3] offset:48 +; GFX10-NEXT: global_store_dwordx4 v70, v[48:51], s[2:3] offset:64 +; GFX10-NEXT: global_store_dwordx4 v70, v[52:55], s[2:3] offset:80 +; GFX10-NEXT: global_store_dwordx4 v70, v[56:59], s[2:3] offset:96 +; GFX10-NEXT: global_store_dwordx4 v70, v[60:63], s[2:3] offset:112 +; GFX10-NEXT: global_store_dwordx4 v70, v[28:31], s[2:3] offset:240 +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id %vec = load <64 x i32>, <64 x i32> addrspace(1)* %gep.in diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-abs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-abs.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-abs.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-abs.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s --- name: smax_neg_abs_pattern_s32_ss diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir @@ -2,6 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: add_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir @@ -3,6 +3,7 @@ # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=MOVREL %s # RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-vgpr-index-mode -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GPRIDX %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GPRIDX %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=MOVREL %s --- name: extract_vector_elt_s_s32_v2s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fabs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fabs.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fabs.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fabs.mir @@ -2,6 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s --- name: fabs_s32_ss diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.s16.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s --- name: fmaxnum_ieee_f16_vv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: fmaxnum_ieee_v2f16_vv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.s16.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s --- name: fmaxnum_f16_vv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir @@ -1,5 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s + # FIXME: Ideally this would fail to select with ieee mode enabled. --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.s16.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s --- name: fminnum_ieee_f16_vv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: fminnum_ieee_v2f16_vv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.s16.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s --- name: fminnum_f16_vv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: fminnum_v2f16_vv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: fmul_v2f16_vv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fneg.mir @@ -2,6 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s --- name: fneg_s32_ss diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=MOVREL %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GPRIDX %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=MOVREL %s --- name: insert_vector_elt_s_s32_v2s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s --- @@ -23,6 +24,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; GFX10-LABEL: name: load_atomic_flat_s32_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) + ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load seq_cst 4, align 4, addrspace 0) $vgpr0 = COPY %1 @@ -50,6 +56,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst 4) ; GFX9: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX10-LABEL: name: load_atomic_flat_v2s16_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst 4) + ; GFX10: $vgpr0 = COPY [[LOAD]](<2 x s16>) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load seq_cst 4, align 4, addrspace 0) $vgpr0 = COPY %1 @@ -77,6 +88,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst 4) ; GFX9: $vgpr0 = COPY [[LOAD]](p3) + ; GFX10-LABEL: name: load_atomic_flat_p3_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p0) :: (load seq_cst 4) + ; GFX10: $vgpr0 = COPY [[LOAD]](p3) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = G_LOAD %0 :: (load seq_cst 4, align 4, addrspace 0) $vgpr0 = COPY %1 @@ -104,6 +120,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8) ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; GFX10-LABEL: name: load_atomic_flat_s64_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8) + ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -131,6 +152,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst 8) ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-LABEL: name: load_atomic_flat_v2s32_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst 8) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -158,6 +184,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst 8) ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX10-LABEL: name: load_atomic_flat_v4s16_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load seq_cst 8) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -185,6 +216,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst 8) ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; GFX10-LABEL: name: load_atomic_flat_p1_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p0) :: (load seq_cst 8) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](p1) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -212,6 +248,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst 8) ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; GFX10-LABEL: name: load_atomic_flat_p0_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p0) :: (load seq_cst 8) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](p0) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(p0) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -259,6 +300,21 @@ ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; GFX10-LABEL: name: load_atomic_flat_s32_seq_cst_gep_m2048 + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec + ; GFX10: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 + ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) + ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 %2:vgpr(p0) = G_PTR_ADD %0, %1 @@ -298,6 +354,21 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; GFX10-LABEL: name: load_atomic_flat_s32_seq_cst_gep_4095 + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX10: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 + ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) + ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4095 %2:vgpr(p0) = G_PTR_ADD %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir @@ -3,6 +3,7 @@ # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7-FLAT %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s --- @@ -45,6 +46,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_atomic_global_s32_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load seq_cst 4, align 4, addrspace 1) $vgpr0 = COPY %1 @@ -82,6 +88,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX10-LABEL: name: load_atomic_global_v2s16_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[LOAD]](<2 x s16>) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load seq_cst 4, align 4, addrspace 1) $vgpr0 = COPY %1 @@ -119,6 +130,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[LOAD]](p3) + ; GFX10-LABEL: name: load_atomic_global_p3_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[LOAD]](p3) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = G_LOAD %0 :: (load seq_cst 4, align 4, addrspace 1) $vgpr0 = COPY %1 @@ -166,6 +182,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; GFX10-LABEL: name: load_atomic_global_s64_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) + ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -203,6 +224,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-LABEL: name: load_atomic_global_v2s32_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -240,6 +266,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX10-LABEL: name: load_atomic_global_v4s16_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -277,6 +308,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; GFX10-LABEL: name: load_atomic_global_p1_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](p1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -314,6 +350,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](p0) + ; GFX10-LABEL: name: load_atomic_global_p0_seq_cst + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX10: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](p0) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p0) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -391,6 +432,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], -2048, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048 + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], -2048, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -450,6 +496,21 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4095, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095 + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX10: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GFX10: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4095 %2:vgpr(p1) = G_PTR_ADD %0, %1 @@ -529,6 +590,11 @@ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], -2048, 0, implicit $exec :: (load seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; GFX10-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048 + ; GFX10: liveins: $vgpr0_vgpr1 + ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], -2048, 0, implicit $exec :: (load seq_cst 8, addrspace 1) + ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 %2:vgpr(p1) = G_PTR_ADD %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir @@ -2,6 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- @@ -29,14 +29,6 @@ ; GFX9: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], [[COPY1]], implicit-def $scc ; GFX9: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_]], [[COPY2]], implicit-def $scc ; GFX9: S_ENDPGM 0, implicit [[S_ADD_I32_1]] - ; GFX10-LABEL: name: add_s32_sgpr_sgpr_sgpr - ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2 - ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], [[COPY1]], implicit-def $scc - ; GFX10: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_]], [[COPY2]], implicit-def $scc - ; GFX10: S_ENDPGM 0, implicit [[S_ADD_I32_1]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 %2:sgpr(s32) = COPY $sgpr2 @@ -70,13 +62,6 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_ADD3_U32_e64_]] - ; GFX10-LABEL: name: add_s32_vgpr_vgpr_vgpr - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_ADD3_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -111,14 +96,6 @@ ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_]] - ; GFX10-LABEL: name: add_s32_vgpr_vgpr_vgpr_multi_use - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX10: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]], implicit [[V_ADD_U32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -154,14 +131,6 @@ ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]] - ; GFX10-LABEL: name: add_p3_vgpr_vgpr_vgpr - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX10: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -197,14 +166,6 @@ ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]] - ; GFX10-LABEL: name: add_p5_vgpr_vgpr_vgpr - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX10: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -240,14 +201,6 @@ ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]], [[V_ADD_U32_e64_]], 0, implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]] - ; GFX10-LABEL: name: add_p3_s32_vgpr_vgpr_vgpr - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX10: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]], [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(p3) = COPY $vgpr2 @@ -283,14 +236,6 @@ ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]], [[V_ADD_U32_e64_]], 0, implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]] - ; GFX10-LABEL: name: add_p5_s32_vgpr_vgpr_vgpr - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec - ; GFX10: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]], [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(p5) = COPY $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- @@ -29,14 +29,6 @@ ; GFX9: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc ; GFX9: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc ; GFX9: S_ENDPGM 0, implicit [[S_OR_B32_]] - ; GFX10-LABEL: name: and_or_s32_sgpr_sgpr_sgpr - ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2 - ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX10: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc - ; GFX10: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc - ; GFX10: S_ENDPGM 0, implicit [[S_OR_B32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 %2:sgpr(s32) = COPY $sgpr2 @@ -70,13 +62,6 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_AND_OR_B32_e64_]] - ; GFX10-LABEL: name: and_or_s32_vgpr_vgpr_vgpr - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_OR_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -110,13 +95,6 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_AND_OR_B32_e64_]] - ; GFX10-LABEL: name: and_or_s32_vgpr_vgpr_vgpr_commute - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_AND_OR_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -153,15 +131,6 @@ ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]] ; GFX9: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_OR_B32_e64_]] - ; GFX10-LABEL: name: and_or_s32_sgpr_sgpr_vgpr - ; GFX10: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc - ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]] - ; GFX10: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_OR_B32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 %2:vgpr(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-or3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-or3.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-or3.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-or3.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- @@ -29,14 +29,6 @@ ; GFX9: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY]], [[COPY1]], implicit-def $scc ; GFX9: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_OR_B32_]], [[COPY2]], implicit-def $scc ; GFX9: S_ENDPGM 0, implicit [[S_OR_B32_1]] - ; GFX10-LABEL: name: or_s32_sgpr_sgpr_sgpr - ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2 - ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX10: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY]], [[COPY1]], implicit-def $scc - ; GFX10: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_OR_B32_]], [[COPY2]], implicit-def $scc - ; GFX10: S_ENDPGM 0, implicit [[S_OR_B32_1]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 %2:sgpr(s32) = COPY $sgpr2 @@ -70,13 +62,6 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_OR3_B32_e64_]] - ; GFX10-LABEL: name: or_s32_vgpr_vgpr_vgpr - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_OR3_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -111,14 +96,6 @@ ; GFX9: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY]], [[COPY1]], implicit $exec ; GFX9: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_]], [[COPY2]], implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_OR_B32_e64_1]], implicit [[V_OR_B32_e64_]] - ; GFX10-LABEL: name: or_s32_vgpr_vgpr_vgpr_multi_use - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY]], [[COPY1]], implicit $exec - ; GFX10: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_]], [[COPY2]], implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_OR_B32_e64_1]], implicit [[V_OR_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s --- name: smed3_s16_vvv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s --- name: umed3_s16_vvv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shuffle-vector.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shuffle-vector.v2s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shuffle-vector.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shuffle-vector.v2s16.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=instruction-select -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=instruction-select -o - %s | FileCheck -check-prefix=GFX9 %s --- name: v_shufflevector_v2s16_v2s16_u_u diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s --- name: smax_s32_ss diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s --- name: smin_s32_ss diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smulh.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smulh.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smulh.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smulh.mir @@ -2,6 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=SI %s # RUN: FileCheck -check-prefix=ERR %s < %t # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s # ERR-NOT: remark: # ERR: remark: :0:0: cannot select: %2:sgpr(s32) = G_SMULH %0:sgpr, %1:sgpr (in function: smulh_s32_ss) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir @@ -2,6 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sub.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sub.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sub.mir @@ -2,7 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: sub_s32 @@ -35,16 +35,6 @@ ; GFX9: [[V_SUB_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[S_SUB_I32_]], [[V_SUB_U32_e64_]], 0, implicit $exec ; GFX9: [[V_SUB_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_1]], [[COPY2]], 0, implicit $exec ; GFX9: S_ENDPGM 0, implicit [[V_SUB_U32_e64_2]] - ; GFX10-LABEL: name: sub_s32 - ; GFX10: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr3_vgpr4 - ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY]], [[COPY1]], implicit-def $scc - ; GFX10: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[COPY2]], [[S_SUB_I32_]], 0, implicit $exec - ; GFX10: [[V_SUB_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[S_SUB_I32_]], [[V_SUB_U32_e64_]], 0, implicit $exec - ; GFX10: [[V_SUB_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_1]], [[COPY2]], 0, implicit $exec - ; GFX10: S_ENDPGM 0, implicit [[V_SUB_U32_e64_2]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 %2:vgpr(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir @@ -2,6 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s --- name: umax_s32_ss diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GCN %s --- name: umin_s32_ss diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umulh.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umulh.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umulh.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umulh.mir @@ -2,6 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=SI %s # RUN: FileCheck -check-prefix=ERR %s < %t # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s # ERR-NOT: remark: # ERR: remark: :0:0: cannot select: %2:sgpr(s32) = G_UMULH %0:sgpr, %1:sgpr (in function: umulh_s32_ss) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_add_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir @@ -2,6 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=VI %s # RUN: llc -march=amdgcn -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_addrspacecast_p0_to_p1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_ashr_s32_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector-trunc.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector-trunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector-trunc.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: legal_s32_to_v2s16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX78 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX78 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: build_vector_v2s16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fadd_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fceil.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fceil.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fceil.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fceil.mir @@ -3,6 +3,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer -o - %s | FileCheck -check-prefix=CI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -o - %s | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_fceil_s16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir @@ -2,6 +2,7 @@ # RUN: llc -O0 -march=amdgcn -mcpu=hawaii -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -O0 -march=amdgcn -mcpu=fiji -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -O0 -march=amdgcn -mcpu=gfx1010 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_fcmp_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_copysign_s16_s16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_fcos_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fexp_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fexp2_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_ffloor_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fma_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fmaxnum_s32_ieee_mode_on diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fminnum_s32_ieee_mode_on diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fmul_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fneg_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fpow_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_fpowi_s16_s32_flags diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fshr_s32_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_fsin_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fsqrt_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_fsub_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-icmp.mir @@ -2,6 +2,7 @@ # RUN: llc -O0 -march=amdgcn -mcpu=hawaii -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -O0 -march=amdgcn -mcpu=fiji -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -O0 -march=amdgcn -mcpu=gfx1010 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_icmp_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_intrinsic_round_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-trunc.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-trunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-trunc.mir @@ -3,6 +3,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer -o - %s | FileCheck -check-prefix=CI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -o - %s | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_intrinsic_trunc_s16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=CI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_load_constant_s1_align1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=CI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_load_flat_s1_align1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir @@ -8,6 +8,8 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-HSA %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-MESA %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-HSA %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefixes=GFX9-MESA %s --- name: test_load_global_s1_align1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir @@ -5,6 +5,8 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -mattr=-unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -mattr=+unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9-UNALIGNED %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -mattr=-unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -mattr=+unaligned-access-mode -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX10-UNALIGNED %s --- name: test_load_local_s1_align1 @@ -54,6 +56,20 @@ ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) ; GFX9-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] ; GFX9-UNALIGNED: $vgpr0 = COPY [[AND]](s32) + ; GFX10-LABEL: name: test_load_local_s1_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; GFX10: $vgpr0 = COPY [[AND]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s1_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; GFX10-UNALIGNED: $vgpr0 = COPY [[AND]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s1) = G_LOAD %0 :: (load 1, align 1, addrspace 3) %2:_(s32) = G_ZEXT %1 @@ -108,6 +124,20 @@ ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) ; GFX9-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] ; GFX9-UNALIGNED: $vgpr0 = COPY [[AND]](s32) + ; GFX10-LABEL: name: test_load_local_s2_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; GFX10: $vgpr0 = COPY [[AND]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s2_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; GFX10-UNALIGNED: $vgpr0 = COPY [[AND]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s2) = G_LOAD %0 :: (load 1, align 1, addrspace 3) %2:_(s32) = G_ZEXT %1 @@ -150,6 +180,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-LABEL: name: test_load_local_s8_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s8_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s8) = G_LOAD %0 :: (load 1, align 4, addrspace 3) %2:_(s32) = G_ANYEXT %1 @@ -192,6 +232,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-LABEL: name: test_load_local_s8_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s8_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s8) = G_LOAD %0 :: (load 1, align 1, addrspace 3) %2:_(s32) = G_ANYEXT %1 @@ -234,6 +284,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-LABEL: name: test_load_local_s16_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s16_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s16) = G_LOAD %0 :: (load 2, align 4, addrspace 3) %2:_(s32) = G_ANYEXT %1 @@ -276,6 +336,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-LABEL: name: test_load_local_s16_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s16_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s16) = G_LOAD %0 :: (load 2, align 2, addrspace 3) %2:_(s32) = G_ANYEXT %1 @@ -379,6 +449,38 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 1, addrspace 3) ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-LABEL: name: test_load_local_s16_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) + ; GFX10: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s16_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; GFX10-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s16) = G_LOAD %0 :: (load 2, align 1, addrspace 3) %2:_(s32) = G_ANYEXT %1 @@ -415,6 +517,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](s32) + ; GFX10-LABEL: name: test_load_local_s32_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: $vgpr0 = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s32_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[LOAD]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -505,6 +615,36 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, align 2, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](s32) + ; GFX10-LABEL: name: test_load_local_s32_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: $vgpr0 = COPY [[OR]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s32_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: $vgpr0 = COPY [[OR]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load 4, align 2, addrspace 3) $vgpr0 = COPY %1 @@ -675,6 +815,68 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](s32) + ; GFX10-LABEL: name: test_load_local_s32_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: $vgpr0 = COPY [[OR2]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s32_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: $vgpr0 = COPY [[OR2]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load 4, align 1, addrspace 3) $vgpr0 = COPY %1 @@ -716,6 +918,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, align 8, addrspace 3) ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-LABEL: name: test_load_local_s24_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, align 8, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s24_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, align 8, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s24) = G_LOAD %0 :: (load 3, align 8, addrspace 3) %2:_(s32) = G_ANYEXT %1 @@ -758,6 +970,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-LABEL: name: test_load_local_s24_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s24_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s24) = G_LOAD %0 :: (load 3, align 4, addrspace 3) %2:_(s32) = G_ANYEXT %1 @@ -836,6 +1058,28 @@ ; GFX9-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[OR]](s32) ; GFX9-UNALIGNED: $vgpr0 = COPY [[COPY1]](s32) + ; GFX10-LABEL: name: test_load_local_s24_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 2, align 2, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(s24) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(s24) = G_INSERT [[DEF]], [[LOAD]](s16), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(s24) = G_INSERT [[INSERT]], [[LOAD1]](s8), 16 + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INSERT1]](s24) + ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s24_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 2, align 2, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(s24) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(s24) = G_INSERT [[DEF]], [[LOAD]](s16), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(s24) = G_INSERT [[INSERT]], [[LOAD1]](s8), 16 + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INSERT1]](s24) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s24) = G_LOAD %0 :: (load 3, align 2, addrspace 3) %2:_(s32) = G_ANYEXT %1 @@ -917,6 +1161,28 @@ ; GFX9-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[AND]] ; GFX9-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR]](s32) ; GFX9-UNALIGNED: $vgpr0 = COPY [[COPY2]](s32) + ; GFX10-LABEL: name: test_load_local_s24_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p3) :: (load 2, align 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(s24) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(s24) = G_INSERT [[DEF]], [[LOAD]](s16), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(s24) = G_INSERT [[INSERT]], [[LOAD1]](s8), 16 + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INSERT1]](s24) + ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s24_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p3) :: (load 2, align 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(s24) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(s24) = G_INSERT [[DEF]], [[LOAD]](s16), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(s24) = G_INSERT [[INSERT]], [[LOAD1]](s8), 16 + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INSERT1]](s24) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s24) = G_LOAD %0 :: (load 3, align 1, addrspace 3) %2:_(s32) = G_ANYEXT %1 @@ -959,6 +1225,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) ; GFX9-UNALIGNED: [[COPY1:%[0-9]+]]:_(s64) = COPY [[LOAD]](s64) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[COPY1]](s64) + ; GFX10-LABEL: name: test_load_local_s48_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s64) = COPY [[LOAD]](s64) + ; GFX10: $vgpr0_vgpr1 = COPY [[COPY1]](s64) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s48_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s64) = COPY [[LOAD]](s64) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[COPY1]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s48) = G_LOAD %0 :: (load 6, align 8, addrspace 3) %2:_(s64) = G_ANYEXT %1 @@ -995,6 +1271,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; GFX10-LABEL: name: test_load_local_s64_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s64_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -1030,6 +1314,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; GFX10-LABEL: name: test_load_local_s64_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s64_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -1185,6 +1477,62 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 2, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; GFX10-LABEL: name: test_load_local_s64_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s64_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 8, align 2, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -1529,6 +1877,128 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; GFX10-LABEL: name: test_load_local_s64_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] + ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] + ; GFX10: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) + ; GFX10: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] + ; GFX10: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) + ; GFX10: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] + ; GFX10: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] + ; GFX10: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) + ; GFX10: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX10: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] + ; GFX10: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) + ; GFX10: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] + ; GFX10: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX10: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX10: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX10: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s64_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] + ; GFX10-UNALIGNED: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX10-UNALIGNED: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] + ; GFX10-UNALIGNED: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX10-UNALIGNED: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10-UNALIGNED: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX10-UNALIGNED: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10-UNALIGNED: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX10-UNALIGNED: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 8, align 1, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -1950,6 +2420,168 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 1, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) + ; GFX10-LABEL: name: test_load_local_s96_align16 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s96_align16 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10-UNALIGNED: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10-UNALIGNED: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10-UNALIGNED: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10-UNALIGNED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10-UNALIGNED: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10-UNALIGNED: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10-UNALIGNED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10-UNALIGNED: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10-UNALIGNED: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10-UNALIGNED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10-UNALIGNED: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10-UNALIGNED: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p3) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 1, addrspace 3) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -2021,6 +2653,28 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 8, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) + ; GFX10-LABEL: name: test_load_local_s96_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 8, align 8, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s96_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 8, align 8, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p3) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 8, addrspace 3) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -2092,6 +2746,28 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) + ; GFX10-LABEL: name: test_load_local_s96_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s96_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p3) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 4, addrspace 3) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -2318,6 +2994,90 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 2, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) + ; GFX10-LABEL: name: test_load_local_s96_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2 from unknown-address + 10, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR2]](s32), 64 + ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s96_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR2]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p3) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 2, addrspace 3) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -2739,6 +3499,168 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 1, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) + ; GFX10-LABEL: name: test_load_local_s96_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s96_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10-UNALIGNED: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10-UNALIGNED: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10-UNALIGNED: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10-UNALIGNED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10-UNALIGNED: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10-UNALIGNED: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10-UNALIGNED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10-UNALIGNED: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10-UNALIGNED: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10-UNALIGNED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10-UNALIGNED: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10-UNALIGNED: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p3) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 1, addrspace 3) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -3262,6 +4184,208 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 1, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) + ; GFX10-LABEL: name: test_load_local_s128_align16 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32) + ; GFX10: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; GFX10: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; GFX10: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; GFX10: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX10: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX10: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX10: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX10: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX10: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX10: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX10: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX10: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX10: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX10: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s128_align16 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10-UNALIGNED: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10-UNALIGNED: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10-UNALIGNED: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10-UNALIGNED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10-UNALIGNED: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10-UNALIGNED: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10-UNALIGNED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10-UNALIGNED: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10-UNALIGNED: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10-UNALIGNED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10-UNALIGNED: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10-UNALIGNED: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10-UNALIGNED: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32) + ; GFX10-UNALIGNED: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10-UNALIGNED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX10-UNALIGNED: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX10-UNALIGNED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX10-UNALIGNED: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX10-UNALIGNED: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX10-UNALIGNED: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX10-UNALIGNED: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX10-UNALIGNED: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX10-UNALIGNED: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX10-UNALIGNED: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX10-UNALIGNED: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p3) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load 16, align 1, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -3311,6 +4435,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) + ; GFX10-LABEL: name: test_load_local_s128_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3) + ; GFX10: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s128_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3) + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p3) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load 16, align 8, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -3390,6 +4524,36 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) + ; GFX10-LABEL: name: test_load_local_s128_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 4, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 from unknown-address + 12, addrspace 3) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s128_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p3) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load 16, align 4, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -3661,6 +4825,108 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 2, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) + ; GFX10-LABEL: name: test_load_local_s128_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2 from unknown-address + 10, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 2 from unknown-address + 12, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 2 from unknown-address + 14, addrspace 3) + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR3]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) + ; GFX10: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[CONCAT_VECTORS]](<4 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s128_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 2 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 2 from unknown-address + 14, addrspace 3) + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR3]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[CONCAT_VECTORS]](<4 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p3) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load 16, align 2, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -4184,6 +5450,208 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 1, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) + ; GFX10-LABEL: name: test_load_local_s128_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32) + ; GFX10: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; GFX10: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; GFX10: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; GFX10: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX10: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX10: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX10: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX10: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX10: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX10: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX10: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX10: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX10: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX10: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_s128_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10-UNALIGNED: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10-UNALIGNED: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10-UNALIGNED: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10-UNALIGNED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10-UNALIGNED: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10-UNALIGNED: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10-UNALIGNED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10-UNALIGNED: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10-UNALIGNED: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10-UNALIGNED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10-UNALIGNED: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10-UNALIGNED: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10-UNALIGNED: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32) + ; GFX10-UNALIGNED: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10-UNALIGNED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX10-UNALIGNED: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX10-UNALIGNED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX10-UNALIGNED: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX10-UNALIGNED: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX10-UNALIGNED: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX10-UNALIGNED: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX10-UNALIGNED: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX10-UNALIGNED: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX10-UNALIGNED: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX10-UNALIGNED: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p3) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load 16, align 1, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -4219,6 +5687,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; GFX10-LABEL: name: test_load_local_p1_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_p1_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](p1) %0:_(p3) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -4254,6 +5730,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; GFX10-LABEL: name: test_load_local_p1_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_p1_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](p1) %0:_(p3) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -4409,6 +5893,62 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 2, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; GFX10-LABEL: name: test_load_local_p1_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[MV]](p1) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_p1_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[MV]](p1) %0:_(p3) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load 8, align 2, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -4753,6 +6293,128 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; GFX10-LABEL: name: test_load_local_p1_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] + ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] + ; GFX10: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) + ; GFX10: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] + ; GFX10: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) + ; GFX10: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] + ; GFX10: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] + ; GFX10: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) + ; GFX10: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX10: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] + ; GFX10: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) + ; GFX10: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] + ; GFX10: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX10: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX10: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX10: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[MV]](p1) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_p1_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] + ; GFX10-UNALIGNED: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX10-UNALIGNED: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] + ; GFX10-UNALIGNED: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX10-UNALIGNED: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10-UNALIGNED: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX10-UNALIGNED: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10-UNALIGNED: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX10-UNALIGNED: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[MV]](p1) %0:_(p3) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load 8, align 1, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -4788,6 +6450,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](p3) + ; GFX10-LABEL: name: test_load_local_p3_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: $vgpr0 = COPY [[LOAD]](p3) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_p3_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[LOAD]](p3) %0:_(p3) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -4883,6 +6553,38 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p3) :: (load 4, align 2, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](p3) + ; GFX10-LABEL: name: test_load_local_p3_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; GFX10: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_p3_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p3) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load 4, align 2, addrspace 3) $vgpr0 = COPY %1 @@ -5058,6 +6760,70 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p3) :: (load 4, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](p3) + ; GFX10-LABEL: name: test_load_local_p3_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; GFX10: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_p3_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p3) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load 4, align 1, addrspace 3) $vgpr0 = COPY %1 @@ -5093,6 +6859,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](p5) + ; GFX10-LABEL: name: test_load_local_p5_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: $vgpr0 = COPY [[LOAD]](p5) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_p5_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[LOAD]](p5) %0:_(p3) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -5188,6 +6962,38 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p3) :: (load 4, align 2, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](p5) + ; GFX10-LABEL: name: test_load_local_p5_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; GFX10: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_p5_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p3) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load 4, align 2, addrspace 3) $vgpr0 = COPY %1 @@ -5363,6 +7169,70 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p3) :: (load 4, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](p5) + ; GFX10-LABEL: name: test_load_local_p5_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; GFX10: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_p5_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p3) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load 4, align 1, addrspace 3) $vgpr0 = COPY %1 @@ -5494,6 +7364,44 @@ ; GFX9-UNALIGNED: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] ; GFX9-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) ; GFX9-UNALIGNED: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-LABEL: name: test_load_local_v2s8_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C2]](s32) + ; GFX10: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX10: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s8_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C2]](s32) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; GFX10-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s8>) = G_LOAD %0 :: (load 2, align 2, addrspace 3) %2:_(s16) = G_BITCAST %1 @@ -5578,6 +7486,26 @@ ; GFX9-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX9-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; GFX10-LABEL: name: test_load_local_v2s8_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s8_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s8>) = G_LOAD %0 :: (load 2, align 1, addrspace 3) %2:_(<2 x s32>) = G_ANYEXT %1 @@ -5626,6 +7554,18 @@ ; GFX9-UNALIGNED: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF ; GFX9-UNALIGNED: [[INSERT:%[0-9]+]]:_(<4 x s8>) = G_INSERT [[DEF]], [[LOAD]](<3 x s8>), 0 ; GFX9-UNALIGNED: $vgpr0 = COPY [[INSERT]](<4 x s8>) + ; GFX10-LABEL: name: test_load_local_v3s8_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[COPY]](p3) :: (load 3, align 4, addrspace 1) + ; GFX10: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<4 x s8>) = G_INSERT [[DEF]], [[LOAD]](<3 x s8>), 0 + ; GFX10: $vgpr0 = COPY [[INSERT]](<4 x s8>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s8_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[COPY]](p3) :: (load 3, align 4, addrspace 1) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<4 x s8>) = G_INSERT [[DEF]], [[LOAD]](<3 x s8>), 0 + ; GFX10-UNALIGNED: $vgpr0 = COPY [[INSERT]](<4 x s8>) %0:_(p3) = COPY $vgpr0 %1:_(<3 x s8>) = G_LOAD %0 :: (load 3, addrspace 1, align 4) %2:_(<4 x s8>) = G_IMPLICIT_DEF @@ -5675,6 +7615,18 @@ ; GFX9-UNALIGNED: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF ; GFX9-UNALIGNED: [[INSERT:%[0-9]+]]:_(<4 x s8>) = G_INSERT [[DEF]], [[LOAD]](<3 x s8>), 0 ; GFX9-UNALIGNED: $vgpr0 = COPY [[INSERT]](<4 x s8>) + ; GFX10-LABEL: name: test_load_local_v3s8_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[COPY]](p3) :: (load 3, align 1, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<4 x s8>) = G_INSERT [[DEF]], [[LOAD]](<3 x s8>), 0 + ; GFX10: $vgpr0 = COPY [[INSERT]](<4 x s8>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s8_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[COPY]](p3) :: (load 3, align 1, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<4 x s8>) = G_INSERT [[DEF]], [[LOAD]](<3 x s8>), 0 + ; GFX10-UNALIGNED: $vgpr0 = COPY [[INSERT]](<4 x s8>) %0:_(p3) = COPY $vgpr0 %1:_(<3 x s8>) = G_LOAD %0 :: (load 3, align 1, addrspace 3) %2:_(<4 x s8>) = G_IMPLICIT_DEF @@ -5788,6 +7740,42 @@ ; GFX9-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9-UNALIGNED: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) ; GFX9-UNALIGNED: $vgpr0 = COPY [[TRUNC]](<4 x s8>) + ; GFX10-LABEL: name: test_load_local_v4s8_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C2]](s32) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10: $vgpr0 = COPY [[TRUNC]](<4 x s8>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s8_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C2]](s32) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[TRUNC]](<4 x s8>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s8>) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -5969,6 +7957,68 @@ ; GFX9-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS1]](<4 x s16>) ; GFX9-UNALIGNED: [[CONCAT_VECTORS2:%[0-9]+]]:_(<8 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS2]](<8 x s8>) + ; GFX10-LABEL: name: test_load_local_v8s8_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; GFX10: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; GFX10: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[COPY8]](s32) + ; GFX10: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX10: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS1]](<4 x s16>) + ; GFX10: [[CONCAT_VECTORS2:%[0-9]+]]:_(<8 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>) + ; GFX10: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS2]](<8 x s8>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v8s8_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10-UNALIGNED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10-UNALIGNED: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; GFX10-UNALIGNED: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[COPY8]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX10-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS1]](<4 x s16>) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS2:%[0-9]+]]:_(<8 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS2]](<8 x s8>) %0:_(p3) = COPY $vgpr0 %1:_(<8 x s8>) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -6364,6 +8414,152 @@ ; GFX9-UNALIGNED: [[TRUNC3:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS3]](<4 x s16>) ; GFX9-UNALIGNED: [[CONCAT_VECTORS4:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>), [[TRUNC2]](<4 x s8>), [[TRUNC3]](<4 x s8>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS4]](<16 x s8>) + ; GFX10-LABEL: name: test_load_local_v16s8_align16 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; GFX10: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; GFX10: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; GFX10: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; GFX10: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[COPY8]](s32) + ; GFX10: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX10: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS1]](<4 x s16>) + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32) + ; GFX10: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX10: [[TRUNC2:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS2]](<4 x s16>) + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX10: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>) + ; GFX10: [[TRUNC3:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS3]](<4 x s16>) + ; GFX10: [[CONCAT_VECTORS4:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>), [[TRUNC2]](<4 x s8>), [[TRUNC3]](<4 x s8>) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS4]](<16 x s8>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v16s8_align16 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; GFX10-UNALIGNED: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[COPY8]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX10-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS1]](<4 x s16>) + ; GFX10-UNALIGNED: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10-UNALIGNED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) + ; GFX10-UNALIGNED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10-UNALIGNED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX10-UNALIGNED: [[TRUNC2:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS2]](<4 x s16>) + ; GFX10-UNALIGNED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX10-UNALIGNED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX10-UNALIGNED: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX10-UNALIGNED: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>) + ; GFX10-UNALIGNED: [[TRUNC3:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS3]](<4 x s16>) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS4:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>), [[TRUNC2]](<4 x s8>), [[TRUNC3]](<4 x s8>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS4]](<16 x s8>) %0:_(p3) = COPY $vgpr0 %1:_(<16 x s8>) = G_LOAD %0 :: (load 16, align 1, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -6399,6 +8595,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX10-LABEL: name: test_load_local_v2s16_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s16_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[LOAD]](<2 x s16>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -6488,6 +8692,26 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load 4, align 2, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX10-LABEL: name: test_load_local_v2s16_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s16_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load 4, align 2, addrspace 3) $vgpr0 = COPY %1 @@ -6672,6 +8896,64 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load 4, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX10-LABEL: name: test_load_local_v2s16_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) + ; GFX10: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]] + ; GFX10: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C1]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C2]](s16) + ; GFX10: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX10: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX10: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s16_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; GFX10-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]] + ; GFX10-UNALIGNED: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C1]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C2]](s16) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX10-UNALIGNED: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load 4, align 1, addrspace 3) $vgpr0 = COPY %1 @@ -6731,6 +9013,22 @@ ; GFX9-UNALIGNED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9-UNALIGNED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>) + ; GFX10-LABEL: name: test_load_local_v3s16_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX10: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX10: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX10: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s16_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX10-UNALIGNED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>) %0:_(p3) = COPY $vgpr0 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 8, addrspace 3) %2:_(<4 x s16>) = G_IMPLICIT_DEF @@ -6909,6 +9207,60 @@ ; GFX9-UNALIGNED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-UNALIGNED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[LOAD]](<3 x s16>), 0 ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>) + ; GFX10-LABEL: name: test_load_local_v3s16_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX10: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 + ; GFX10: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX10: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX10: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 + ; GFX10: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX10: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 + ; GFX10: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s16_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10-UNALIGNED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 + ; GFX10-UNALIGNED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10-UNALIGNED: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX10-UNALIGNED: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX10-UNALIGNED: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 + ; GFX10-UNALIGNED: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10-UNALIGNED: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX10-UNALIGNED: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) %0:_(p3) = COPY $vgpr0 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 2, addrspace 3) %2:_(<4 x s16>) = G_IMPLICIT_DEF @@ -7223,6 +9575,112 @@ ; GFX9-UNALIGNED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-UNALIGNED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[LOAD]](<3 x s16>), 0 ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>) + ; GFX10-LABEL: name: test_load_local_v3s16_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) + ; GFX10: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]] + ; GFX10: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C1]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C2]](s16) + ; GFX10: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX10: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C1]] + ; GFX10: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C1]] + ; GFX10: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) + ; GFX10: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX10: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX10: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 + ; GFX10: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX10: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX10: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR2]](s16), 32 + ; GFX10: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX10: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 + ; GFX10: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s16_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; GFX10-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]] + ; GFX10-UNALIGNED: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C1]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C2]](s16) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX10-UNALIGNED: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C1]] + ; GFX10-UNALIGNED: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C1]] + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10-UNALIGNED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 + ; GFX10-UNALIGNED: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10-UNALIGNED: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX10-UNALIGNED: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX10-UNALIGNED: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR2]](s16), 32 + ; GFX10-UNALIGNED: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10-UNALIGNED: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX10-UNALIGNED: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) %0:_(p3) = COPY $vgpr0 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 1, addrspace 3) %2:_(<4 x s16>) = G_IMPLICIT_DEF @@ -7259,6 +9717,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX10-LABEL: name: test_load_local_v4s16_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s16_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -7294,6 +9760,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX10-LABEL: name: test_load_local_v4s16_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s16_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -7448,6 +9922,46 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 8, align 2, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX10-LABEL: name: test_load_local_v4s16_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s16_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load 8, align 2, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -7777,6 +10291,114 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 8, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX10-LABEL: name: test_load_local_v4s16_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) + ; GFX10: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]] + ; GFX10: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C1]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C2]](s16) + ; GFX10: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX10: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C1]] + ; GFX10: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C1]] + ; GFX10: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) + ; GFX10: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C1]] + ; GFX10: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C1]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C2]](s16) + ; GFX10: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] + ; GFX10: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[OR2]](s16) + ; GFX10: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[OR3]](s16) + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s16_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; GFX10-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]] + ; GFX10-UNALIGNED: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C1]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C2]](s16) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX10-UNALIGNED: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C1]] + ; GFX10-UNALIGNED: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C1]] + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C1]] + ; GFX10-UNALIGNED: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C1]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C2]](s16) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] + ; GFX10-UNALIGNED: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[OR2]](s16) + ; GFX10-UNALIGNED: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[OR3]](s16) + ; GFX10-UNALIGNED: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load 8, align 1, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -7812,6 +10434,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-LABEL: name: test_load_local_v2s32_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s32_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -7847,6 +10477,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-LABEL: name: test_load_local_v2s32_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s32_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -7997,6 +10635,60 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 2, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-LABEL: name: test_load_local_v2s32_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s32_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 8, align 2, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -8287,6 +10979,116 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-LABEL: name: test_load_local_v2s32_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s32_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 8, align 1, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -8702,6 +11504,166 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; GFX10-LABEL: name: test_load_local_v3s32_align16 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s32_align16 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10-UNALIGNED: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10-UNALIGNED: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10-UNALIGNED: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10-UNALIGNED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10-UNALIGNED: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10-UNALIGNED: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10-UNALIGNED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10-UNALIGNED: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10-UNALIGNED: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10-UNALIGNED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10-UNALIGNED: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10-UNALIGNED: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<3 x s32>) = G_LOAD %0 :: (load 12, align 1, addrspace 3) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -8767,6 +11729,26 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; GFX10-LABEL: name: test_load_local_v3s32_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s32_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<3 x s32>) = G_LOAD %0 :: (load 12, align 4, addrspace 3) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -8810,6 +11792,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; GFX10-LABEL: name: test_load_local_v4s32_align16 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, addrspace 3) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s32_align16 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 16, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -8853,6 +11843,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; GFX10-LABEL: name: test_load_local_v4s32_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s32_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 8, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -8926,6 +11924,34 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; GFX10-LABEL: name: test_load_local_v4s32_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 4, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 from unknown-address + 12, addrspace 3) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s32_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 4, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -9191,6 +12217,106 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 2, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; GFX10-LABEL: name: test_load_local_v4s32_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2 from unknown-address + 10, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 2 from unknown-address + 12, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 2 from unknown-address + 14, addrspace 3) + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR3]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s32_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 2 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 2 from unknown-address + 14, addrspace 3) + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR3]](s32) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 2, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -9708,6 +12834,206 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; GFX10-LABEL: name: test_load_local_v4s32_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32) + ; GFX10: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; GFX10: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; GFX10: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; GFX10: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX10: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX10: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX10: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX10: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX10: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX10: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX10: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX10: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX10: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX10: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s32_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10-UNALIGNED: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10-UNALIGNED: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10-UNALIGNED: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10-UNALIGNED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10-UNALIGNED: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10-UNALIGNED: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10-UNALIGNED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10-UNALIGNED: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10-UNALIGNED: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10-UNALIGNED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10-UNALIGNED: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10-UNALIGNED: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10-UNALIGNED: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32) + ; GFX10-UNALIGNED: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10-UNALIGNED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX10-UNALIGNED: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; GFX10-UNALIGNED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX10-UNALIGNED: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX10-UNALIGNED: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX10-UNALIGNED: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX10-UNALIGNED: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX10-UNALIGNED: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX10-UNALIGNED: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX10-UNALIGNED: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX10-UNALIGNED: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 1, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -9779,6 +13105,22 @@ ; GFX9-UNALIGNED: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 16 from unknown-address + 16, addrspace 3) ; GFX9-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>) + ; GFX10-LABEL: name: test_load_local_v8s32_align32 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 16 from unknown-address + 16, addrspace 3) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v8s32_align32 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 16 from unknown-address + 16, addrspace 3) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, align 32, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1 @@ -9814,6 +13156,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[LOAD]](<16 x s32>) + ; GFX10-LABEL: name: test_load_local_v16s32_align32 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[LOAD]](<16 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v16s32_align32 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[LOAD]](<16 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<16 x s32>) = G_LOAD %0 :: (load 16, align 32, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %1 @@ -9869,6 +13219,22 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; GFX10-LABEL: name: test_load_local_v2s64_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8 from unknown-address + 8, align 4, addrspace 3) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s64_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8 from unknown-address + 8, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s64>) = G_LOAD %0 :: (load 16, align 4, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -10492,6 +13858,230 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; GFX10-LABEL: name: test_load_local_v2s64_align16 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] + ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] + ; GFX10: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) + ; GFX10: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] + ; GFX10: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) + ; GFX10: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] + ; GFX10: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] + ; GFX10: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) + ; GFX10: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX10: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] + ; GFX10: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) + ; GFX10: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] + ; GFX10: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX10: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX10: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX10: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX10: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32) + ; GFX10: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s32) + ; GFX10: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; GFX10: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s32) + ; GFX10: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s32) + ; GFX10: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) + ; GFX10: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] + ; GFX10: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) + ; GFX10: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] + ; GFX10: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX10: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] + ; GFX10: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) + ; GFX10: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] + ; GFX10: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) + ; GFX10: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] + ; GFX10: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX10: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX10: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) + ; GFX10: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] + ; GFX10: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) + ; GFX10: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] + ; GFX10: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX10: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] + ; GFX10: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) + ; GFX10: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] + ; GFX10: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) + ; GFX10: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] + ; GFX10: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX10: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX10: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX10: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX10: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX10: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX10: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX10: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX10: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX10: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX10: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s64_align16 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-UNALIGNED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]] + ; GFX10-UNALIGNED: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]] + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX10-UNALIGNED: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] + ; GFX10-UNALIGNED: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX10-UNALIGNED: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10-UNALIGNED: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX10-UNALIGNED: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10-UNALIGNED: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX10-UNALIGNED: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX10-UNALIGNED: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32) + ; GFX10-UNALIGNED: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s32) + ; GFX10-UNALIGNED: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s32) + ; GFX10-UNALIGNED: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10-UNALIGNED: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) + ; GFX10-UNALIGNED: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) + ; GFX10-UNALIGNED: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] + ; GFX10-UNALIGNED: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] + ; GFX10-UNALIGNED: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) + ; GFX10-UNALIGNED: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) + ; GFX10-UNALIGNED: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] + ; GFX10-UNALIGNED: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX10-UNALIGNED: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) + ; GFX10-UNALIGNED: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) + ; GFX10-UNALIGNED: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] + ; GFX10-UNALIGNED: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] + ; GFX10-UNALIGNED: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) + ; GFX10-UNALIGNED: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] + ; GFX10-UNALIGNED: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) + ; GFX10-UNALIGNED: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] + ; GFX10-UNALIGNED: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX10-UNALIGNED: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX10-UNALIGNED: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX10-UNALIGNED: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX10-UNALIGNED: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX10-UNALIGNED: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX10-UNALIGNED: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX10-UNALIGNED: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX10-UNALIGNED: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX10-UNALIGNED: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX10-UNALIGNED: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s64>) = G_LOAD %0 :: (load 16, align 1, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -10577,6 +14167,30 @@ ; GFX9-UNALIGNED: [[DEF1:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; GFX9-UNALIGNED: [[INSERT2:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF1]], [[INSERT1]](<3 x s64>), 0 ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT2]](<4 x s64>) + ; GFX10-LABEL: name: test_load_local_v3s64_align32 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8 from unknown-address + 16, align 16, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s64>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s64>) = G_INSERT [[DEF]], [[LOAD]](<2 x s64>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s64>) = G_INSERT [[INSERT]], [[LOAD1]](s64), 128 + ; GFX10: [[DEF1:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT2:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF1]], [[INSERT1]](<3 x s64>), 0 + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT2]](<4 x s64>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s64_align32 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8 from unknown-address + 16, align 16, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s64>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s64>) = G_INSERT [[DEF]], [[LOAD]](<2 x s64>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s64>) = G_INSERT [[INSERT]], [[LOAD1]](s64), 128 + ; GFX10-UNALIGNED: [[DEF1:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT2:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF1]], [[INSERT1]](<3 x s64>), 0 + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT2]](<4 x s64>) %0:_(p3) = COPY $vgpr0 %1:_(<3 x s64>) = G_LOAD %0 :: (load 24, align 32, addrspace 3) %2:_(<4 x s64>) = G_IMPLICIT_DEF @@ -10650,6 +14264,22 @@ ; GFX9-UNALIGNED: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p3) :: (load 16 from unknown-address + 16, addrspace 3) ; GFX9-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) + ; GFX10-LABEL: name: test_load_local_v4s64_align32 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p3) :: (load 16 from unknown-address + 16, addrspace 3) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s64_align32 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p3) :: (load 16 from unknown-address + 16, addrspace 3) + ; GFX10-UNALIGNED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s64>) = G_LOAD %0 :: (load 32, align 32, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1 @@ -10729,6 +14359,36 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) ; GFX9-UNALIGNED: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) + ; GFX10-LABEL: name: test_load_local_v2p1_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 4, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 from unknown-address + 12, addrspace 3) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; GFX10: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2p1_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 4, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -10764,6 +14424,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; GFX10-LABEL: name: test_load_local_v2p3_align8 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2p3_align8 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x p3>) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -10799,6 +14467,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](s32) + ; GFX10-LABEL: name: test_extload_local_s32_from_1_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) + ; GFX10: $vgpr0 = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_s32_from_1_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[LOAD]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load 1, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -10834,6 +14510,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0 = COPY [[LOAD]](s32) + ; GFX10-LABEL: name: test_extload_local_s32_from_2_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) + ; GFX10: $vgpr0 = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_s32_from_2_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0 = COPY [[LOAD]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load 2, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -10876,6 +14560,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) ; GFX9-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + ; GFX10-LABEL: name: test_extload_local_s64_from_1_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_s64_from_1_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 1, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -10917,6 +14611,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) ; GFX9-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + ; GFX10-LABEL: name: test_extload_local_s64_from_2_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_s64_from_2_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 2, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -10958,6 +14662,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) ; GFX9-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + ; GFX10-LABEL: name: test_extload_local_s64_from_4_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_s64_from_4_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -11017,6 +14731,22 @@ ; GFX9-UNALIGNED: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF ; GFX9-UNALIGNED: [[MV1:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[DEF1]](s64) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV1]](s128) + ; GFX10-LABEL: name: test_extload_local_s128_from_4_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[DEF]](s32) + ; GFX10: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; GFX10: [[MV1:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[DEF1]](s64) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV1]](s128) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_s128_from_4_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[DEF]](s32) + ; GFX10-UNALIGNED: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[MV1:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[DEF1]](s64) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV1]](s128) %0:_(p3) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -11058,6 +14788,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) ; GFX9-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + ; GFX10-LABEL: name: test_extload_local_s64_from_2_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_s64_from_2_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 2, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -11099,6 +14839,16 @@ ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) ; GFX9-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + ; GFX10-LABEL: name: test_extload_local_s64_from_1_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; GFX10: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_s64_from_1_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 1, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -11134,6 +14884,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 4, align 1, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-LABEL: name: test_extload_local_v2s32_from_4_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 4, align 1, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_v2s32_from_4_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 4, align 1, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 1, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -11169,6 +14927,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 4, align 2, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-LABEL: name: test_extload_local_v2s32_from_4_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 4, align 2, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_v2s32_from_4_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 4, align 2, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 2, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -11204,6 +14970,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-LABEL: name: test_extload_local_v2s32_from_4_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_v2s32_from_4_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -11239,6 +15013,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 6, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; GFX10-LABEL: name: test_extload_local_v3s32_from_6_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 6, align 4, addrspace 3) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_v3s32_from_6_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 6, align 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<3 x s32>) = G_LOAD %0 :: (load 6, align 4, addrspace 3) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -11274,6 +15056,14 @@ ; GFX9-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; GFX10-LABEL: name: test_extload_local_v4s32_from_8_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_v4s32_from_8_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -12077,6 +15867,318 @@ ; GFX9-UNALIGNED: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) ; GFX9-UNALIGNED: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; GFX10-LABEL: name: test_extload_local_v2s96_from_24_align1 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[COPY13:%[0-9]+]]:_(<3 x s32>) = COPY [[DEF]](<3 x s32>) + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[COPY13]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32) + ; GFX10: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; GFX10: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; GFX10: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; GFX10: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX10: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX10: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX10: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX10: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX10: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX10: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX10: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX10: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX10: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX10: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX10: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C3]] + ; GFX10: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX10: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX10: [[PTR_ADD15:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C7]](s32) + ; GFX10: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD15]](p3) :: (load 1 from unknown-address + 16, addrspace 3) + ; GFX10: [[PTR_ADD16:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; GFX10: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD16]](p3) :: (load 1 from unknown-address + 17, addrspace 3) + ; GFX10: [[PTR_ADD17:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD15]], [[C1]](s32) + ; GFX10: [[LOAD18:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD17]](p3) :: (load 1 from unknown-address + 18, addrspace 3) + ; GFX10: [[PTR_ADD18:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; GFX10: [[LOAD19:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p3) :: (load 1 from unknown-address + 19, addrspace 3) + ; GFX10: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LOAD16]](s32) + ; GFX10: [[AND16:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C3]] + ; GFX10: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) + ; GFX10: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C3]] + ; GFX10: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[C4]](s32) + ; GFX10: [[OR12:%[0-9]+]]:_(s32) = G_OR [[AND16]], [[SHL12]] + ; GFX10: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LOAD18]](s32) + ; GFX10: [[AND18:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C3]] + ; GFX10: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND18]], [[C5]](s32) + ; GFX10: [[OR13:%[0-9]+]]:_(s32) = G_OR [[OR12]], [[SHL13]] + ; GFX10: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) + ; GFX10: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C3]] + ; GFX10: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[C6]](s32) + ; GFX10: [[OR14:%[0-9]+]]:_(s32) = G_OR [[OR13]], [[SHL14]] + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32) + ; GFX10: [[PTR_ADD19:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; GFX10: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD19]](p3) :: (load 1 from unknown-address + 20, addrspace 3) + ; GFX10: [[PTR_ADD20:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) + ; GFX10: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD20]](p3) :: (load 1 from unknown-address + 21, addrspace 3) + ; GFX10: [[PTR_ADD21:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD19]], [[C1]](s32) + ; GFX10: [[LOAD22:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD21]](p3) :: (load 1 from unknown-address + 22, addrspace 3) + ; GFX10: [[PTR_ADD22:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; GFX10: [[LOAD23:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p3) :: (load 1 from unknown-address + 23, addrspace 3) + ; GFX10: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LOAD20]](s32) + ; GFX10: [[AND20:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C3]] + ; GFX10: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) + ; GFX10: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C3]] + ; GFX10: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[C4]](s32) + ; GFX10: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND20]], [[SHL15]] + ; GFX10: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LOAD22]](s32) + ; GFX10: [[AND22:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C3]] + ; GFX10: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[AND22]], [[C5]](s32) + ; GFX10: [[OR16:%[0-9]+]]:_(s32) = G_OR [[OR15]], [[SHL16]] + ; GFX10: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) + ; GFX10: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C3]] + ; GFX10: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[C6]](s32) + ; GFX10: [[OR17:%[0-9]+]]:_(s32) = G_OR [[OR16]], [[SHL17]] + ; GFX10: [[INSERT2:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR1]](<2 x s32>), 0 + ; GFX10: [[INSERT3:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT2]], [[OR17]](s32), 64 + ; GFX10: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT3]](<3 x s32>) + ; GFX10: [[COPY26:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) + ; GFX10: [[COPY27:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[COPY26]](s96) + ; GFX10: $vgpr3_vgpr4_vgpr5 = COPY [[COPY27]](s96) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_v2s96_from_24_align1 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 1 from unknown-address + 1, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 1 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 1 from unknown-address + 3, addrspace 3) + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX10-UNALIGNED: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; GFX10-UNALIGNED: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 1 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 1 from unknown-address + 5, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 1 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1 from unknown-address + 7, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10-UNALIGNED: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1 from unknown-address + 9, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1 from unknown-address + 11, addrspace 3) + ; GFX10-UNALIGNED: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10-UNALIGNED: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX10-UNALIGNED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10-UNALIGNED: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX10-UNALIGNED: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; GFX10-UNALIGNED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10-UNALIGNED: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX10-UNALIGNED: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; GFX10-UNALIGNED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10-UNALIGNED: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX10-UNALIGNED: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[COPY13:%[0-9]+]]:_(<3 x s32>) = COPY [[DEF]](<3 x s32>) + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[COPY13]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32) + ; GFX10-UNALIGNED: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1 from unknown-address + 13, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1 from unknown-address + 14, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1 from unknown-address + 15, addrspace 3) + ; GFX10-UNALIGNED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD12]](s32) + ; GFX10-UNALIGNED: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; GFX10-UNALIGNED: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) + ; GFX10-UNALIGNED: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C3]] + ; GFX10-UNALIGNED: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL9]] + ; GFX10-UNALIGNED: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32) + ; GFX10-UNALIGNED: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C3]] + ; GFX10-UNALIGNED: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; GFX10-UNALIGNED: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) + ; GFX10-UNALIGNED: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C3]] + ; GFX10-UNALIGNED: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; GFX10-UNALIGNED: [[PTR_ADD15:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C7]](s32) + ; GFX10-UNALIGNED: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD15]](p3) :: (load 1 from unknown-address + 16, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD16:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD16]](p3) :: (load 1 from unknown-address + 17, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD17:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD15]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD18:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD17]](p3) :: (load 1 from unknown-address + 18, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD18:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD19:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p3) :: (load 1 from unknown-address + 19, addrspace 3) + ; GFX10-UNALIGNED: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LOAD16]](s32) + ; GFX10-UNALIGNED: [[AND16:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C3]] + ; GFX10-UNALIGNED: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) + ; GFX10-UNALIGNED: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C3]] + ; GFX10-UNALIGNED: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR12:%[0-9]+]]:_(s32) = G_OR [[AND16]], [[SHL12]] + ; GFX10-UNALIGNED: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LOAD18]](s32) + ; GFX10-UNALIGNED: [[AND18:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C3]] + ; GFX10-UNALIGNED: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND18]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR13:%[0-9]+]]:_(s32) = G_OR [[OR12]], [[SHL13]] + ; GFX10-UNALIGNED: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) + ; GFX10-UNALIGNED: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C3]] + ; GFX10-UNALIGNED: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR14:%[0-9]+]]:_(s32) = G_OR [[OR13]], [[SHL14]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32) + ; GFX10-UNALIGNED: [[PTR_ADD19:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD19]](p3) :: (load 1 from unknown-address + 20, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD20:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD20]](p3) :: (load 1 from unknown-address + 21, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD21:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD19]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD22:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD21]](p3) :: (load 1 from unknown-address + 22, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD22:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; GFX10-UNALIGNED: [[LOAD23:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p3) :: (load 1 from unknown-address + 23, addrspace 3) + ; GFX10-UNALIGNED: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LOAD20]](s32) + ; GFX10-UNALIGNED: [[AND20:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C3]] + ; GFX10-UNALIGNED: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) + ; GFX10-UNALIGNED: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C3]] + ; GFX10-UNALIGNED: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[C4]](s32) + ; GFX10-UNALIGNED: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND20]], [[SHL15]] + ; GFX10-UNALIGNED: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LOAD22]](s32) + ; GFX10-UNALIGNED: [[AND22:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C3]] + ; GFX10-UNALIGNED: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[AND22]], [[C5]](s32) + ; GFX10-UNALIGNED: [[OR16:%[0-9]+]]:_(s32) = G_OR [[OR15]], [[SHL16]] + ; GFX10-UNALIGNED: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) + ; GFX10-UNALIGNED: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C3]] + ; GFX10-UNALIGNED: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[C6]](s32) + ; GFX10-UNALIGNED: [[OR17:%[0-9]+]]:_(s32) = G_OR [[OR16]], [[SHL17]] + ; GFX10-UNALIGNED: [[INSERT2:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR1]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT3:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT2]], [[OR17]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT3]](<3 x s32>) + ; GFX10-UNALIGNED: [[COPY26:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) + ; GFX10-UNALIGNED: [[COPY27:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[COPY26]](s96) + ; GFX10-UNALIGNED: $vgpr3_vgpr4_vgpr5 = COPY [[COPY27]](s96) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 1, addrspace 3) %2:_(s96) = G_EXTRACT %1, 0 @@ -12508,6 +16610,168 @@ ; GFX9-UNALIGNED: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) ; GFX9-UNALIGNED: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; GFX10-LABEL: name: test_extload_local_v2s96_from_24_align2 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX10: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX10: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2 from unknown-address + 8, addrspace 3) + ; GFX10: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2 from unknown-address + 10, addrspace 3) + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX10: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX10: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[COPY7:%[0-9]+]]:_(<3 x s32>) = COPY [[DEF]](<3 x s32>) + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[COPY7]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR2]](s32), 64 + ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; GFX10: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 2 from unknown-address + 12, addrspace 3) + ; GFX10: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; GFX10: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 2 from unknown-address + 14, addrspace 3) + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; GFX10: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GFX10: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX10: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; GFX10: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 2 from unknown-address + 16, addrspace 3) + ; GFX10: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 2 from unknown-address + 18, addrspace 3) + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; GFX10: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; GFX10: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32) + ; GFX10: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32) + ; GFX10: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C4]](s32) + ; GFX10: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 2 from unknown-address + 20, addrspace 3) + ; GFX10: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; GFX10: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 2 from unknown-address + 22, addrspace 3) + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] + ; GFX10: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) + ; GFX10: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; GFX10: [[INSERT2:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR1]](<2 x s32>), 0 + ; GFX10: [[INSERT3:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT2]], [[OR5]](s32), 64 + ; GFX10: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT3]](<3 x s32>) + ; GFX10: [[COPY14:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) + ; GFX10: [[COPY15:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[COPY14]](s96) + ; GFX10: $vgpr3_vgpr4_vgpr5 = COPY [[COPY15]](s96) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_v2s96_from_24_align2 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 2 from unknown-address + 2, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX10-UNALIGNED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX10-UNALIGNED: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX10-UNALIGNED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-UNALIGNED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX10-UNALIGNED: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 from unknown-address + 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2 from unknown-address + 6, addrspace 3) + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX10-UNALIGNED: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX10-UNALIGNED: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX10-UNALIGNED: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX10-UNALIGNED: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10-UNALIGNED: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2 from unknown-address + 10, addrspace 3) + ; GFX10-UNALIGNED: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX10-UNALIGNED: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX10-UNALIGNED: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX10-UNALIGNED: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX10-UNALIGNED: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[COPY7:%[0-9]+]]:_(<3 x s32>) = COPY [[DEF]](<3 x s32>) + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[COPY7]], [[BUILD_VECTOR]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[OR2]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; GFX10-UNALIGNED: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 2 from unknown-address + 12, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 2 from unknown-address + 14, addrspace 3) + ; GFX10-UNALIGNED: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX10-UNALIGNED: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX10-UNALIGNED: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX10-UNALIGNED: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; GFX10-UNALIGNED: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX10-UNALIGNED: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; GFX10-UNALIGNED: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 2 from unknown-address + 16, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 2 from unknown-address + 18, addrspace 3) + ; GFX10-UNALIGNED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX10-UNALIGNED: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; GFX10-UNALIGNED: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX10-UNALIGNED: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; GFX10-UNALIGNED: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; GFX10-UNALIGNED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32) + ; GFX10-UNALIGNED: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD5]], [[C4]](s32) + ; GFX10-UNALIGNED: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 2 from unknown-address + 20, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 2 from unknown-address + 22, addrspace 3) + ; GFX10-UNALIGNED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX10-UNALIGNED: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; GFX10-UNALIGNED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX10-UNALIGNED: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] + ; GFX10-UNALIGNED: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) + ; GFX10-UNALIGNED: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; GFX10-UNALIGNED: [[INSERT2:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[BUILD_VECTOR1]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT3:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT2]], [[OR5]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT3]](<3 x s32>) + ; GFX10-UNALIGNED: [[COPY14:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) + ; GFX10-UNALIGNED: [[COPY15:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[COPY14]](s96) + ; GFX10-UNALIGNED: $vgpr3_vgpr4_vgpr5 = COPY [[COPY15]](s96) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 2, addrspace 3) %2:_(s96) = G_EXTRACT %1, 0 @@ -12649,6 +16913,52 @@ ; GFX9-UNALIGNED: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) ; GFX9-UNALIGNED: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; GFX10-LABEL: name: test_extload_local_v2s96_from_24_align4 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY [[DEF]](<3 x s32>) + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[COPY1]], [[LOAD]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 from unknown-address + 12, align 4, addrspace 3) + ; GFX10: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 from unknown-address + 20, addrspace 3) + ; GFX10: [[INSERT2:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD2]](<2 x s32>), 0 + ; GFX10: [[INSERT3:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64 + ; GFX10: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT3]](<3 x s32>) + ; GFX10: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) + ; GFX10: [[COPY3:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96) + ; GFX10: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_v2s96_from_24_align4 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 from unknown-address + 8, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY [[DEF]](<3 x s32>) + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[COPY1]], [[LOAD]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 from unknown-address + 12, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 from unknown-address + 20, addrspace 3) + ; GFX10-UNALIGNED: [[INSERT2:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD2]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT3:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT3]](<3 x s32>) + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) + ; GFX10-UNALIGNED: [[COPY3:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96) + ; GFX10-UNALIGNED: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 4, addrspace 3) %2:_(s96) = G_EXTRACT %1, 0 @@ -12775,6 +17085,42 @@ ; GFX9-UNALIGNED: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; GFX9-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) ; GFX9-UNALIGNED: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; GFX10-LABEL: name: test_extload_local_v2s96_from_24_align16 + ; GFX10: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 16, addrspace 3) + ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8 from unknown-address + 12, align 4, addrspace 3) + ; GFX10: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD]], [[C1]](s32) + ; GFX10: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 from unknown-address + 20, addrspace 3) + ; GFX10: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD1]](<2 x s32>), 0 + ; GFX10: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD2]](s32), 64 + ; GFX10: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) + ; GFX10: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) + ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) + ; GFX10: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; GFX10-UNALIGNED-LABEL: name: test_extload_local_v2s96_from_24_align16 + ; GFX10-UNALIGNED: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX10-UNALIGNED: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 16, addrspace 3) + ; GFX10-UNALIGNED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; GFX10-UNALIGNED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-UNALIGNED: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX10-UNALIGNED: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8 from unknown-address + 12, align 4, addrspace 3) + ; GFX10-UNALIGNED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-UNALIGNED: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD]], [[C1]](s32) + ; GFX10-UNALIGNED: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 from unknown-address + 20, addrspace 3) + ; GFX10-UNALIGNED: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GFX10-UNALIGNED: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD1]](<2 x s32>), 0 + ; GFX10-UNALIGNED: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD2]](s32), 64 + ; GFX10-UNALIGNED: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>) + ; GFX10-UNALIGNED: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) + ; GFX10-UNALIGNED: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) + ; GFX10-UNALIGNED: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) + ; GFX10-UNALIGNED: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 16, addrspace 3) %2:_(s96) = G_EXTRACT %1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -3,6 +3,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=CI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_load_private_s1_align1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_lshr_s32_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_mul_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir @@ -2,6 +2,7 @@ # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: saddsat_s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_sdiv_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir @@ -2,6 +2,7 @@ # RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s # RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX6 %s +# RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_sext_inreg_s32_1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_shl_s32_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s --- name: shufflevector_v2s16_v2s16_undef_undef diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_smax_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_smin_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_srem_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sshlsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sshlsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sshlsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sshlsat.mir @@ -2,6 +2,7 @@ # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: sshlsat_s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir @@ -2,6 +2,7 @@ # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: ssubsat_s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir @@ -3,6 +3,7 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=CI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_store_global_s1_align1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_sub_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir @@ -2,6 +2,7 @@ # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: uaddsat_s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_udiv_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_umax_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s --- name: test_umin_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s --- name: test_urem_s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ushlsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ushlsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ushlsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ushlsat.mir @@ -2,6 +2,7 @@ # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: ushlsat_s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir @@ -2,6 +2,7 @@ # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -global-isel-abort=0 -march=amdgcn -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s --- name: usubsat_s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s ; FIXME: Merge with other test. DS offset folding doesn't work due to ; register bank copies, and no return optimization is missing. @@ -60,6 +61,19 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_inc_ret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false), !noalias !0 store i32 %result, i32 addrspace(1)* %out ret void @@ -110,6 +124,19 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_inc_ret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out @@ -145,6 +172,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_inc_noret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } @@ -178,6 +214,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_inc_noret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 +; GFX10-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) ret void @@ -222,6 +267,17 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_ret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out ret void @@ -270,6 +326,17 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_ret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out @@ -305,6 +372,15 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[0:1] glc ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_noret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[0:1] glc +; GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } @@ -342,6 +418,15 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[0:1] offset:16 glc ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_noret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[0:1] offset:16 glc +; GFX10-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) ret void @@ -400,6 +485,17 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_ret_i32_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id @@ -448,6 +544,15 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[0:1] offset:20 glc ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_noret_i32_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc v0, v0, v1, s[0:1] offset:20 glc +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 @@ -504,6 +609,19 @@ ; GFX9-NEXT: global_store_dword v2, v1, s[2:3] ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_inc_shl_base_lds_0_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 9 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX10-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-NEXT: global_store_dword v2, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -559,6 +677,20 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_inc_ret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void @@ -610,6 +742,20 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_inc_ret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -648,6 +794,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_inc_noret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX10-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -684,6 +840,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_atomic_inc_noret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) ret void @@ -731,6 +897,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_ret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void @@ -782,6 +960,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_ret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -820,6 +1010,16 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_noret_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc +; GFX10-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -860,6 +1060,16 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_noret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc +; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) ret void @@ -921,6 +1131,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_ret_i64_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:40 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id @@ -972,6 +1194,16 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 @@ -1044,6 +1276,22 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s2, s2, 16 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm %gep = getelementptr i32, i32* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32* %out @@ -1098,6 +1346,18 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s0, 16 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GFX10-NEXT: s_endpgm %gep = getelementptr i32, i32* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) ret void @@ -1164,6 +1424,27 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dword v[2:3], v0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 20 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_atomic_inc v3, v[0:1], v3 glc +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_store_dword v[0:1], v3 +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32* %ptr, i32 %id %out.gep = getelementptr i32, i32* %out, i32 %id @@ -1216,6 +1497,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:20 glc ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 20 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32* %ptr, i32 %id %gep = getelementptr i32, i32* %gep.tid, i32 5 @@ -1275,6 +1571,20 @@ ; GFX9-NEXT: global_store_dword v3, v2, s[2:3] ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: atomic_inc_shl_base_lds_0_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 9 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX10-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v3, v0, s[2:3] +; GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX10-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 @@ -1353,6 +1663,23 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s2, s2, 32 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64* %out @@ -1411,6 +1738,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s0, 32 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) ret void @@ -1480,6 +1820,28 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v8, s3 +; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v7, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v8, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 40 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id %out.gep = getelementptr i64, i64* %out, i32 %id @@ -1535,6 +1897,22 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:40 glc ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 40 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id %gep = getelementptr i64, i64* %gep.tid, i32 5 @@ -1598,6 +1976,21 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: nocse_lds_atomic_inc_ret_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s define amdgpu_ps float @ds_fadd_f32_ss(float addrspace(3)* inreg %ptr, float inreg %val) { ; GFX8-LABEL: ds_fadd_f32_ss: @@ -19,6 +20,14 @@ ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ds_fadd_f32_ss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ; return to shader part epilog %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret } @@ -40,6 +49,14 @@ ; GFX9-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ds_fadd_f32_ss_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -60,6 +77,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: ds_fadd_f32_ss_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX10-NEXT: s_endpgm %unused = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret void } @@ -79,6 +103,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: ds_fadd_f32_ss_offset_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512 +; GFX10-NEXT: s_endpgm %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %unused = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret void @@ -99,6 +130,14 @@ ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: ds_fadd_f32_vv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret } @@ -118,6 +157,14 @@ ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: ds_fadd_f32_vv_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -138,6 +185,14 @@ ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: ds_fadd_f32_vv_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret void } @@ -157,6 +212,14 @@ ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: ds_fadd_f32_vv_offset_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret void @@ -177,6 +240,14 @@ ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: ds_fadd_f32_vv_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 true) ret float %ret } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s define amdgpu_ps float @ds_fmin_f32_ss(float addrspace(3)* inreg %ptr, float inreg %val) { ; GFX8-LABEL: ds_fmin_f32_ss: @@ -19,6 +20,14 @@ ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ds_fmin_f32_ss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ; return to shader part epilog %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret } @@ -40,6 +49,14 @@ ; GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ds_fmin_f32_ss_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -60,6 +77,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: ds_fmin_f32_ss_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX10-NEXT: s_endpgm %unused = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret void } @@ -79,6 +103,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: ds_fmin_f32_ss_offset_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 +; GFX10-NEXT: s_endpgm %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %unused = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret void @@ -99,6 +130,14 @@ ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: ds_fmin_f32_vv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret } @@ -118,6 +157,14 @@ ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: ds_fmin_f32_vv_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -138,6 +185,14 @@ ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: ds_fmin_f32_vv_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret void } @@ -157,6 +212,14 @@ ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: ds_fmin_f32_vv_offset_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret void @@ -177,6 +240,14 @@ ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: ds_fmin_f32_vv_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 true) ret float %ret } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s ; TODO: Merge with DAG test @@ -36,6 +37,21 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: is_private_vgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i8*, i8* addrspace(1)* %ptr.ptr, i32 %id %ptr = load volatile i8*, i8* addrspace(1)* %gep @@ -75,6 +91,21 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB1_2: ; %bb1 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: is_private_sgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_cmp_lg_u32 s1, s0 +; GFX10-NEXT: s_cbranch_scc1 BB1_2 +; GFX10-NEXT: ; %bb.1: ; %bb0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: BB1_2: ; %bb1 +; GFX10-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.private(i8* %ptr) br i1 %val, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s ; TODO: Merge with DAG test @@ -36,6 +37,21 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: is_local_vgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i8*, i8* addrspace(1)* %ptr.ptr, i32 %id %ptr = load volatile i8*, i8* addrspace(1)* %gep @@ -75,6 +91,21 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB1_2: ; %bb1 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: is_local_sgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_cmp_lg_u32 s1, s0 +; GFX10-NEXT: s_cbranch_scc1 BB1_2 +; GFX10-NEXT: ; %bb.1: ; %bb0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: BB1_2: ; %bb1 +; GFX10-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr) br i1 %val, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,CI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,VI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s define double @v_trig_preop_f64(double %a, i32 %b) { ; GCN-LABEL: v_trig_preop_f64: @@ -9,6 +10,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_trig_preop_f64 v[0:1], v[0:1], v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_trig_preop_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_trig_preop_f64 v[0:1], v[0:1], v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) ret double %result } @@ -19,6 +27,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_trig_preop_f64 v[0:1], v[0:1], 7 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_trig_preop_f64_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_trig_preop_f64 v[0:1], v[0:1], 7 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) ret double %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; FIXME: ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s @@ -20,6 +21,14 @@ ; GFX7-NEXT: ds_read_b128 v[0:3], v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b128 v[0:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr ret <4 x i32> %load } @@ -166,6 +175,73 @@ ; GFX7-NEXT: v_or_b32_e32 v3, v5, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u8 v1, v0 offset:1 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:2 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:5 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:6 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v8, v0 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:4 +; GFX10-NEXT: ds_read_u8 v10, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v12, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v13, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v25, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v15, v0 offset:13 +; GFX10-NEXT: ds_read_u8 v16, v0 offset:14 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 +; GFX10-NEXT: v_mov_b32_e32 v17, 8 +; GFX10-NEXT: s_mov_b32 s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v11, 0xff +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: s_waitcnt lgkmcnt(15) +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(14) +; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(13) +; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(12) +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(11) +; GFX10-NEXT: v_and_b32_e32 v5, v5, v11 +; GFX10-NEXT: s_waitcnt lgkmcnt(10) +; GFX10-NEXT: v_and_b32_e32 v6, v6, v11 +; GFX10-NEXT: s_waitcnt lgkmcnt(9) +; GFX10-NEXT: v_lshlrev_b32_sdwa v21, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(8) +; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(5) +; GFX10-NEXT: v_and_b32_e32 v8, v12, v11 +; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_and_b32_e32 v9, v13, v11 +; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v21 +; GFX10-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v12, v16, v11 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, v0, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_and_or_b32 v10, v25, v11, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 +; GFX10-NEXT: v_or3_b32 v3, v10, v11, v12 +; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6 +; GFX10-NEXT: v_or3_b32 v2, v7, v8, v9 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 ret <4 x i32> %load } @@ -236,6 +312,41 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u16 v1, v0 offset:2 +; GFX10-NEXT: ds_read_u16 v2, v0 offset:6 +; GFX10-NEXT: ds_read_u16 v3, v0 offset:10 +; GFX10-NEXT: ds_read_u16 v4, v0 offset:14 +; GFX10-NEXT: ds_read_u16 v5, v0 +; GFX10-NEXT: ds_read_u16 v6, v0 offset:4 +; GFX10-NEXT: ds_read_u16 v7, v0 offset:8 +; GFX10-NEXT: ds_read_u16 v8, v0 offset:12 +; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: s_waitcnt lgkmcnt(7) +; GFX10-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(6) +; GFX10-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(5) +; GFX10-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(3) +; GFX10-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-NEXT: v_and_or_b32 v1, v6, s4, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: v_and_or_b32 v2, v7, s4, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v3, v8, s4, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2 ret <4 x i32> %load } @@ -259,6 +370,16 @@ ; GFX7-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX10-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4 ret <4 x i32> %load } @@ -278,6 +399,14 @@ ; GFX7-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8 ret <4 x i32> %load } @@ -297,6 +426,14 @@ ; GFX7-NEXT: ds_read_b128 v[0:3], v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b128 v[0:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16 ret <4 x i32> %load } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; FIXME: ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s @@ -20,6 +21,14 @@ ; GFX7-NEXT: ds_read_b96 v[0:2], v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b96 v[0:2], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr ret <3 x i32> %load } @@ -137,6 +146,61 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u8 v1, v0 offset:1 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:2 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:5 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:6 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v15, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v10, v0 +; GFX10-NEXT: ds_read_u8 v11, v0 offset:4 +; GFX10-NEXT: ds_read_u8 v14, v0 offset:8 +; GFX10-NEXT: v_mov_b32_e32 v12, 0xff +; GFX10-NEXT: v_mov_b32_e32 v13, 8 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: s_mov_b32 s5, 8 +; GFX10-NEXT: s_waitcnt lgkmcnt(11) +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(10) +; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(9) +; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(8) +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(7) +; GFX10-NEXT: v_and_b32_e32 v5, v5, v12 +; GFX10-NEXT: s_waitcnt lgkmcnt(6) +; GFX10-NEXT: v_and_b32_e32 v6, v6, v12 +; GFX10-NEXT: s_waitcnt lgkmcnt(5) +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_and_b32_e32 v8, v15, v12 +; GFX10-NEXT: s_waitcnt lgkmcnt(3) +; GFX10-NEXT: v_and_b32_e32 v9, v9, v12 +; GFX10-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v7, v14, v12, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 +; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6 +; GFX10-NEXT: v_or3_b32 v2, v7, v8, v9 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1 ret <3 x i32> %load } @@ -194,6 +258,34 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u16 v1, v0 offset:2 +; GFX10-NEXT: ds_read_u16 v2, v0 offset:6 +; GFX10-NEXT: ds_read_u16 v3, v0 offset:10 +; GFX10-NEXT: ds_read_u16 v7, v0 +; GFX10-NEXT: ds_read_u16 v11, v0 offset:4 +; GFX10-NEXT: ds_read_u16 v15, v0 offset:8 +; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: s_waitcnt lgkmcnt(5) +; GFX10-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(3) +; GFX10-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-NEXT: v_and_or_b32 v0, v7, s4, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: v_and_or_b32 v1, v11, s4, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, v15, s4, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2 ret <3 x i32> %load } @@ -217,6 +309,16 @@ ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX10-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4 ret <3 x i32> %load } @@ -240,6 +342,16 @@ ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: ds_read_b64 v[0:1], v0 +; GFX10-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8 ret <3 x i32> %load } @@ -259,6 +371,14 @@ ; GFX7-NEXT: ds_read_b96 v[0:2], v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b96 v[0:2], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16 ret <3 x i32> %load } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; Unaligned DS access in available from GFX9 onwards. ; LDS alignment enforcement is controlled by a configuration register: @@ -90,6 +91,73 @@ ; GFX7-NEXT: v_or_b32_e32 v3, v5, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u8 v1, v0 offset:1 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:2 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:5 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:6 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v8, v0 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:4 +; GFX10-NEXT: ds_read_u8 v10, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v12, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v13, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v25, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v15, v0 offset:13 +; GFX10-NEXT: ds_read_u8 v16, v0 offset:14 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 +; GFX10-NEXT: v_mov_b32_e32 v17, 8 +; GFX10-NEXT: s_mov_b32 s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v11, 0xff +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: s_waitcnt lgkmcnt(15) +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(14) +; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(13) +; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(12) +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(11) +; GFX10-NEXT: v_and_b32_e32 v5, v5, v11 +; GFX10-NEXT: s_waitcnt lgkmcnt(10) +; GFX10-NEXT: v_and_b32_e32 v6, v6, v11 +; GFX10-NEXT: s_waitcnt lgkmcnt(9) +; GFX10-NEXT: v_lshlrev_b32_sdwa v21, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(8) +; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(5) +; GFX10-NEXT: v_and_b32_e32 v8, v12, v11 +; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_and_b32_e32 v9, v13, v11 +; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v21 +; GFX10-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v12, v16, v11 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, v0, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_and_or_b32 v10, v25, v11, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 +; GFX10-NEXT: v_or3_b32 v3, v10, v11, v12 +; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6 +; GFX10-NEXT: v_or3_b32 v2, v7, v8, v9 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 ret <4 x i32> %load } @@ -162,6 +230,61 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u8 v1, v0 offset:1 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:2 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:5 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:6 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v15, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v10, v0 +; GFX10-NEXT: ds_read_u8 v11, v0 offset:4 +; GFX10-NEXT: ds_read_u8 v14, v0 offset:8 +; GFX10-NEXT: v_mov_b32_e32 v12, 0xff +; GFX10-NEXT: v_mov_b32_e32 v13, 8 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: s_mov_b32 s5, 8 +; GFX10-NEXT: s_waitcnt lgkmcnt(11) +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(10) +; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(9) +; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(8) +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(7) +; GFX10-NEXT: v_and_b32_e32 v5, v5, v12 +; GFX10-NEXT: s_waitcnt lgkmcnt(6) +; GFX10-NEXT: v_and_b32_e32 v6, v6, v12 +; GFX10-NEXT: s_waitcnt lgkmcnt(5) +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_and_b32_e32 v8, v15, v12 +; GFX10-NEXT: s_waitcnt lgkmcnt(3) +; GFX10-NEXT: v_and_b32_e32 v9, v9, v12 +; GFX10-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v7, v14, v12, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 +; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6 +; GFX10-NEXT: v_or3_b32 v2, v7, v8, v9 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1 ret <3 x i32> %load } @@ -208,6 +331,41 @@ ; GFX7-NEXT: ds_write_b8 v0, v3 offset:15 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_lds_v4i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: ds_write_b8 v0, v1 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:3 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:6 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v3 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:10 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v4 offset:12 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:13 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:14 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void } @@ -247,6 +405,34 @@ ; GFX7-NEXT: ds_write_b8 v0, v4 offset:11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_lds_v3i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: ds_write_b8 v0, v1 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:3 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:6 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:10 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:11 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s define i8 @v_lshr_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_lshr_i8: @@ -24,6 +25,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i8 %value, %amount ret i8 %result } @@ -49,6 +59,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 7 ; GFX9-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i8_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 7, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i8 %value, 7 ret i8 %result } @@ -77,6 +95,14 @@ ; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i8 %value, %amount ret i8 %result } @@ -87,6 +113,12 @@ ; GCN-NEXT: s_and_b32 s0, s0, 0xff ; GCN-NEXT: s_lshr_b32 s0, s0, 7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i8_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: s_lshr_b32 s0, s0, 7 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i8 %value, 7 ret i8 %result } @@ -112,6 +144,14 @@ ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i24_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 7, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i24 %value, 7 ret i24 %result } @@ -124,6 +164,14 @@ ; GCN-NEXT: s_and_b32 s0, s0, s2 ; GCN-NEXT: s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0xffffff +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i24 %value, %amount ret i24 %result } @@ -134,6 +182,12 @@ ; GCN-NEXT: s_and_b32 s0, s0, 0xffffff ; GCN-NEXT: s_lshr_b32 s0, s0, 7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i24_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s0, s0, 0xffffff +; GFX10-NEXT: s_lshr_b32 s0, s0, 7 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i24 %value, 7 ret i24 %result } @@ -144,6 +198,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i32 %value, %amount ret i32 %result } @@ -154,6 +215,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i32_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i32 %value, 31 ret i32 %result } @@ -163,6 +231,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i32 %value, %amount ret i32 %result } @@ -172,6 +245,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_lshr_b32 s0, s0, 31 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i32_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s0, s0, 31 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i32 %value, 31 ret i32 %result } @@ -191,6 +269,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: lshr_i32_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -201,6 +284,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: lshr_i32_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -213,6 +301,14 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, v2, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i32> %value, %amount ret <2 x i32> %result } @@ -224,6 +320,14 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v2i32_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 31, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i32> %value, ret <2 x i32> %result } @@ -234,6 +338,12 @@ ; GCN-NEXT: s_lshr_b32 s0, s0, s2 ; GCN-NEXT: s_lshr_b32 s1, s1, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <2 x i32> %value, %amount ret <2 x i32> %result } @@ -246,6 +356,15 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v1, v4, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, v5, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v5, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <3 x i32> %value, %amount ret <3 x i32> %result } @@ -257,6 +376,13 @@ ; GCN-NEXT: s_lshr_b32 s1, s1, s4 ; GCN-NEXT: s_lshr_b32 s2, s2, s5 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s0, s0, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, s4 +; GFX10-NEXT: s_lshr_b32 s2, s2, s5 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <3 x i32> %value, %amount ret <3 x i32> %result } @@ -270,6 +396,16 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v2, v6, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, v7, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v5, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <4 x i32> %value, %amount ret <4 x i32> %result } @@ -282,6 +418,14 @@ ; GCN-NEXT: s_lshr_b32 s2, s2, s6 ; GCN-NEXT: s_lshr_b32 s3, s3, s7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s0, s0, s4 +; GFX10-NEXT: s_lshr_b32 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s2, s2, s6 +; GFX10-NEXT: s_lshr_b32 s3, s3, s7 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <4 x i32> %value, %amount ret <4 x i32> %result } @@ -296,6 +440,17 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v3, v8, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v4, v9, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v5i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v5, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v6, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v9, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <5 x i32> %value, %amount ret <5 x i32> %result } @@ -309,6 +464,15 @@ ; GCN-NEXT: s_lshr_b32 s3, s3, s8 ; GCN-NEXT: s_lshr_b32 s4, s4, s9 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_v5i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s0, s0, s5 +; GFX10-NEXT: s_lshr_b32 s1, s1, s6 +; GFX10-NEXT: s_lshr_b32 s2, s2, s7 +; GFX10-NEXT: s_lshr_b32 s3, s3, s8 +; GFX10-NEXT: s_lshr_b32 s4, s4, s9 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <5 x i32> %value, %amount ret <5 x i32> %result } @@ -334,6 +498,28 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v14, v30, v14 ; GCN-NEXT: v_lshrrev_b32_e32 v15, v31, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v17, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v18, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v19, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v20, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v21, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v22, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, v23, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, v24, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, v25, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, v26, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, v27, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, v28, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, v29, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, v30, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, v31, v15 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <16 x i32> %value, %amount ret <16 x i32> %result } @@ -358,6 +544,26 @@ ; GCN-NEXT: s_lshr_b32 s14, s14, s30 ; GCN-NEXT: s_lshr_b32 s15, s15, s31 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s0, s0, s16 +; GFX10-NEXT: s_lshr_b32 s1, s1, s17 +; GFX10-NEXT: s_lshr_b32 s2, s2, s18 +; GFX10-NEXT: s_lshr_b32 s3, s3, s19 +; GFX10-NEXT: s_lshr_b32 s4, s4, s20 +; GFX10-NEXT: s_lshr_b32 s5, s5, s21 +; GFX10-NEXT: s_lshr_b32 s6, s6, s22 +; GFX10-NEXT: s_lshr_b32 s7, s7, s23 +; GFX10-NEXT: s_lshr_b32 s8, s8, s24 +; GFX10-NEXT: s_lshr_b32 s9, s9, s25 +; GFX10-NEXT: s_lshr_b32 s10, s10, s26 +; GFX10-NEXT: s_lshr_b32 s11, s11, s27 +; GFX10-NEXT: s_lshr_b32 s12, s12, s28 +; GFX10-NEXT: s_lshr_b32 s13, s13, s29 +; GFX10-NEXT: s_lshr_b32 s14, s14, s30 +; GFX10-NEXT: s_lshr_b32 s15, s15, s31 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <16 x i32> %value, %amount ret <16 x i32> %result } @@ -383,6 +589,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b16_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b16_e64 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i16 %value, %amount ret i16 %result } @@ -392,6 +605,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i16_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i16 %value, 31 ret i16 %result } @@ -420,6 +639,14 @@ ; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i16 %value, %amount ret i16 %result } @@ -430,6 +657,12 @@ ; GCN-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NEXT: s_lshr_b32 s0, s0, 15 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i16_15: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshr_b32 s0, s0, 15 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i16 %value, 15 ret i16 %result } @@ -452,6 +685,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: lshr_i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -475,6 +713,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshrrev_b16_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: lshr_i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshrrev_b16_e64 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -508,6 +751,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i16> %value, %amount ret <2 x i16> %result } @@ -537,6 +787,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v2i16_15: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i16> %value, ret <2 x i16> %result } @@ -580,6 +837,18 @@ ; GFX9-NEXT: s_lshr_b32 s1, s2, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s2, s1, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s3, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to i32 ret i32 %cast @@ -612,6 +881,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: lshr_v2i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_pk_lshrrev_b16 v0, v0, s0 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -644,6 +918,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_pk_lshrrev_b16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: lshr_v2i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_pk_lshrrev_b16 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -700,6 +979,14 @@ ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v2, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0 +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -768,6 +1055,25 @@ ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-NEXT: s_and_b32 s6, s2, s4 +; GFX10-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-NEXT: s_and_b32 s0, s0, s4 +; GFX10-NEXT: s_lshr_b32 s2, s5, s2 +; GFX10-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-NEXT: s_and_b32 s1, s1, s4 +; GFX10-NEXT: s_and_b32 s4, s3, s4 +; GFX10-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-NEXT: s_lshr_b32 s0, s0, s6 +; GFX10-NEXT: s_lshr_b32 s1, s1, s4 +; GFX10-NEXT: s_lshr_b32 s3, s5, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x i32> ret <2 x i32> %cast @@ -861,6 +1167,16 @@ ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v6, v2 ; GFX9-NEXT: v_pk_lshrrev_b16 v3, v7, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v8i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, v4, v0 +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v5, v1 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x float> ret <4 x float> %cast @@ -977,6 +1293,39 @@ ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_v8i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: s_lshr_b32 s9, s0, 16 +; GFX10-NEXT: s_and_b32 s10, s4, s8 +; GFX10-NEXT: s_and_b32 s0, s0, s8 +; GFX10-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-NEXT: s_lshr_b32 s0, s0, s10 +; GFX10-NEXT: s_lshr_b32 s4, s9, s4 +; GFX10-NEXT: s_lshr_b32 s9, s1, 16 +; GFX10-NEXT: s_and_b32 s10, s5, s8 +; GFX10-NEXT: s_and_b32 s1, s1, s8 +; GFX10-NEXT: s_lshr_b32 s5, s5, 16 +; GFX10-NEXT: s_lshr_b32 s1, s1, s10 +; GFX10-NEXT: s_lshr_b32 s5, s9, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_and_b32 s5, s6, s8 +; GFX10-NEXT: s_and_b32 s2, s2, s8 +; GFX10-NEXT: s_lshr_b32 s6, s6, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, s5 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_and_b32 s6, s7, s8 +; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_lshr_b32 s7, s7, 16 +; GFX10-NEXT: s_lshr_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s5, s5, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x i32> ret <4 x i32> %cast @@ -1000,6 +1349,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i64 %value, %amount ret i64 %result } @@ -1011,6 +1367,14 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i64_63: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i64 %value, 63 ret i64 %result } @@ -1022,6 +1386,14 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i64_33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i64 %value, 33 ret i64 %result } @@ -1033,6 +1405,14 @@ ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i64_32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i64 %value, 32 ret i64 %result } @@ -1055,6 +1435,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_i64_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i64 %value, 31 ret i64 %result } @@ -1064,6 +1451,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i64 %value, %amount ret i64 %result } @@ -1074,6 +1466,12 @@ ; GCN-NEXT: s_lshr_b32 s0, s1, 31 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i64_63: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s0, s1, 31 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i64 %value, 63 ret i64 %result } @@ -1084,6 +1482,12 @@ ; GCN-NEXT: s_lshr_b32 s0, s1, 1 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i64_33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s0, s1, 1 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i64 %value, 33 ret i64 %result } @@ -1094,6 +1498,12 @@ ; GCN-NEXT: s_mov_b32 s0, s1 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i64_32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s1 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i64 %value, 32 ret i64 %result } @@ -1103,6 +1513,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 31 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_i64_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 31 +; GFX10-NEXT: ; return to shader part epilog %result = lshr i64 %value, 31 ret i64 %result } @@ -1122,6 +1537,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: lshr_i64_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] +; GFX10-NEXT: ; return to shader part epilog %result = lshr i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1142,6 +1562,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshrrev_b64 v[0:1], s0, v[0:1] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: lshr_i64_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[0:1] +; GFX10-NEXT: ; return to shader part epilog %result = lshr i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1168,6 +1593,18 @@ ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v4, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v6, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v4, v[10:11] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v6, v[7:8] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i64> %value, %amount ret <2 x i64> %result } @@ -1193,6 +1630,14 @@ ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 31, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_lshr_v2i64_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], 31, v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i64> %value, ret <2 x i64> %result } @@ -1203,6 +1648,12 @@ ; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_lshr_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX10-NEXT: ; return to shader part epilog %result = lshr <2 x i64> %value, %amount ret <2 x i64> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; GFX7-LABEL: s_mul_i16: @@ -24,6 +25,14 @@ ; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_mul_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_mul_i32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -49,6 +58,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_mul_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_lo_u16_e64 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -77,6 +93,15 @@ ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s0, s0, s2 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_mul_i16_zeroext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_mul_i32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -100,6 +125,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_mul_i16_zeroext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_lo_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -128,6 +161,15 @@ ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_mul_i16_signext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_mul_i32 s0, s0, s1 +; GFX10-NEXT: s_sext_i32_i16 s0, s0 +; GFX10-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -156,6 +198,14 @@ ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_mul_i16_signext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_lo_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -165,6 +215,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_mul_i32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_mul_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mul_i32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = mul i32 %num, %den ret i32 %result } @@ -175,6 +230,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_mul_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = mul i32 %num, %den ret i32 %result } @@ -185,6 +247,12 @@ ; GCN-NEXT: s_mul_i32 s0, s0, s2 ; GCN-NEXT: s_mul_i32 s1, s1, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_mul_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mul_i32 s0, s0, s2 +; GFX10-NEXT: s_mul_i32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %result = mul <2 x i32> %num, %den ret <2 x i32> %result } @@ -196,6 +264,14 @@ ; GCN-NEXT: v_mul_lo_u32 v0, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_mul_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = mul <2 x i32> %num, %den ret <2 x i32> %result } @@ -237,6 +313,16 @@ ; GFX9-NEXT: s_add_i32 s1, s1, s0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_mul_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mul_i32 s1, s1, s2 +; GFX10-NEXT: s_mul_i32 s3, s0, s3 +; GFX10-NEXT: s_mul_hi_u32 s4, s0, s2 +; GFX10-NEXT: s_add_i32 s1, s1, s3 +; GFX10-NEXT: s_mul_i32 s0, s0, s2 +; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: ; return to shader part epilog %result = mul i64 %num, %den ret i64 %result } @@ -275,6 +361,17 @@ ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v1, v1, v3, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_mul_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX10-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX10-NEXT: v_add3_u32 v1, v1, v3, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = mul i64 %num, %den ret i64 %result } @@ -366,6 +463,32 @@ ; GFX9-NEXT: s_mov_b32 s0, s6 ; GFX9-NEXT: s_mov_b32 s1, s7 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_mul_i96: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mul_i32 s6, s1, s3 +; GFX10-NEXT: s_mul_i32 s7, s0, s4 +; GFX10-NEXT: s_mul_hi_u32 s8, s0, s3 +; GFX10-NEXT: s_add_u32 s6, s6, s7 +; GFX10-NEXT: s_cselect_b32 s7, 1, 0 +; GFX10-NEXT: s_mul_i32 s9, s1, s4 +; GFX10-NEXT: s_and_b32 s7, s7, 1 +; GFX10-NEXT: s_mul_i32 s2, s2, s3 +; GFX10-NEXT: s_add_u32 s6, s6, s8 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: s_mul_i32 s5, s0, s5 +; GFX10-NEXT: s_add_i32 s2, s2, s9 +; GFX10-NEXT: s_mul_hi_u32 s1, s1, s3 +; GFX10-NEXT: s_add_i32 s2, s2, s5 +; GFX10-NEXT: s_and_b32 s8, s8, 1 +; GFX10-NEXT: s_mul_hi_u32 s4, s0, s4 +; GFX10-NEXT: s_add_i32 s1, s2, s1 +; GFX10-NEXT: s_add_i32 s7, s7, s8 +; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_mul_i32 s0, s0, s3 +; GFX10-NEXT: s_add_i32 s2, s1, s7 +; GFX10-NEXT: s_mov_b32 s1, s6 +; GFX10-NEXT: ; return to shader part epilog %result = mul i96 %num, %den %cast = bitcast i96 %result to <3 x i32> ret <3 x i32> %cast @@ -623,6 +746,61 @@ ; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_mov_b32 s2, s10 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_mul_i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mul_i32 s8, s1, s4 +; GFX10-NEXT: s_mul_i32 s9, s0, s5 +; GFX10-NEXT: s_mul_hi_u32 s10, s0, s4 +; GFX10-NEXT: s_add_u32 s8, s8, s9 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_mul_i32 s11, s1, s5 +; GFX10-NEXT: s_and_b32 s9, s9, 1 +; GFX10-NEXT: s_add_u32 s8, s8, s10 +; GFX10-NEXT: s_cselect_b32 s10, 1, 0 +; GFX10-NEXT: s_mul_i32 s12, s0, s6 +; GFX10-NEXT: s_and_b32 s10, s10, 1 +; GFX10-NEXT: s_mul_hi_u32 s13, s1, s4 +; GFX10-NEXT: s_add_i32 s9, s9, s10 +; GFX10-NEXT: s_mul_i32 s10, s2, s4 +; GFX10-NEXT: s_mul_i32 s3, s3, s4 +; GFX10-NEXT: s_add_u32 s10, s10, s11 +; GFX10-NEXT: s_cselect_b32 s11, 1, 0 +; GFX10-NEXT: s_mul_i32 s7, s0, s7 +; GFX10-NEXT: s_and_b32 s11, s11, 1 +; GFX10-NEXT: s_add_u32 s10, s10, s12 +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_and_b32 s12, s12, 1 +; GFX10-NEXT: s_add_i32 s11, s11, s12 +; GFX10-NEXT: s_add_u32 s10, s10, s13 +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s13, s0, s5 +; GFX10-NEXT: s_and_b32 s12, s12, 1 +; GFX10-NEXT: s_add_i32 s11, s11, s12 +; GFX10-NEXT: s_add_u32 s10, s10, s13 +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_mul_i32 s13, s1, s6 +; GFX10-NEXT: s_and_b32 s12, s12, 1 +; GFX10-NEXT: s_mul_hi_u32 s1, s1, s5 +; GFX10-NEXT: s_add_i32 s11, s11, s12 +; GFX10-NEXT: s_mul_i32 s12, s2, s5 +; GFX10-NEXT: s_add_u32 s9, s10, s9 +; GFX10-NEXT: s_cselect_b32 s10, 1, 0 +; GFX10-NEXT: s_add_i32 s3, s3, s12 +; GFX10-NEXT: s_mul_hi_u32 s2, s2, s4 +; GFX10-NEXT: s_add_i32 s3, s3, s13 +; GFX10-NEXT: s_and_b32 s10, s10, 1 +; GFX10-NEXT: s_add_i32 s3, s3, s7 +; GFX10-NEXT: s_add_i32 s11, s11, s10 +; GFX10-NEXT: s_add_i32 s2, s3, s2 +; GFX10-NEXT: s_mul_hi_u32 s3, s0, s6 +; GFX10-NEXT: s_add_i32 s1, s2, s1 +; GFX10-NEXT: s_mul_i32 s0, s0, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s9 +; GFX10-NEXT: s_add_i32 s3, s1, s11 +; GFX10-NEXT: s_mov_b32 s1, s8 +; GFX10-NEXT: ; return to shader part epilog %result = mul i128 %num, %den %cast = bitcast i128 %result to <4 x i32> ret <4 x i32> %cast @@ -1569,6 +1747,277 @@ ; GFX9-NEXT: s_mov_b32 s5, s21 ; GFX9-NEXT: s_mov_b32 s6, s22 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_mul_i256: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mul_i32 s16, s1, s8 +; GFX10-NEXT: s_mul_i32 s17, s0, s9 +; GFX10-NEXT: s_mul_hi_u32 s18, s0, s8 +; GFX10-NEXT: s_add_u32 s16, s16, s17 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_mul_i32 s19, s1, s9 +; GFX10-NEXT: s_and_b32 s17, s17, 1 +; GFX10-NEXT: s_add_u32 s16, s16, s18 +; GFX10-NEXT: s_cselect_b32 s18, 1, 0 +; GFX10-NEXT: s_mul_i32 s20, s0, s10 +; GFX10-NEXT: s_and_b32 s18, s18, 1 +; GFX10-NEXT: s_mul_hi_u32 s21, s1, s8 +; GFX10-NEXT: s_add_i32 s17, s17, s18 +; GFX10-NEXT: s_mul_i32 s18, s2, s8 +; GFX10-NEXT: s_mul_i32 s22, s0, s11 +; GFX10-NEXT: s_add_u32 s18, s18, s19 +; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_mul_i32 s23, s1, s11 +; GFX10-NEXT: s_and_b32 s19, s19, 1 +; GFX10-NEXT: s_add_u32 s18, s18, s20 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_mul_i32 s24, s0, s12 +; GFX10-NEXT: s_and_b32 s20, s20, 1 +; GFX10-NEXT: s_mul_i32 s25, s4, s9 +; GFX10-NEXT: s_add_i32 s19, s19, s20 +; GFX10-NEXT: s_add_u32 s18, s18, s21 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s21, s0, s9 +; GFX10-NEXT: s_and_b32 s20, s20, 1 +; GFX10-NEXT: s_mul_i32 s26, s2, s11 +; GFX10-NEXT: s_add_i32 s19, s19, s20 +; GFX10-NEXT: s_add_u32 s18, s18, s21 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_mul_i32 s21, s1, s10 +; GFX10-NEXT: s_and_b32 s20, s20, 1 +; GFX10-NEXT: s_mul_i32 s27, s0, s13 +; GFX10-NEXT: s_add_i32 s19, s19, s20 +; GFX10-NEXT: s_add_u32 s17, s18, s17 +; GFX10-NEXT: s_cselect_b32 s18, 1, 0 +; GFX10-NEXT: s_mul_i32 s20, s2, s9 +; GFX10-NEXT: s_and_b32 s18, s18, 1 +; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9 +; GFX10-NEXT: s_add_i32 s19, s19, s18 +; GFX10-NEXT: s_mul_i32 s18, s3, s8 +; GFX10-NEXT: s_mul_i32 s7, s7, s8 +; GFX10-NEXT: s_add_u32 s18, s18, s20 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_mul_i32 s15, s0, s15 +; GFX10-NEXT: s_and_b32 s20, s20, 1 +; GFX10-NEXT: s_add_u32 s18, s18, s21 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_add_i32 s20, s20, s21 +; GFX10-NEXT: s_add_u32 s18, s18, s22 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s22, s2, s8 +; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_add_i32 s20, s20, s21 +; GFX10-NEXT: s_add_u32 s18, s18, s22 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s22, s1, s9 +; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_add_i32 s20, s20, s21 +; GFX10-NEXT: s_add_u32 s18, s18, s22 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s22, s0, s10 +; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_add_i32 s20, s20, s21 +; GFX10-NEXT: s_add_u32 s18, s18, s22 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_mul_i32 s22, s2, s10 +; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_add_i32 s20, s20, s21 +; GFX10-NEXT: s_add_u32 s18, s18, s19 +; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_mul_i32 s21, s3, s9 +; GFX10-NEXT: s_and_b32 s19, s19, 1 +; GFX10-NEXT: s_add_i32 s20, s20, s19 +; GFX10-NEXT: s_mul_i32 s19, s4, s8 +; GFX10-NEXT: s_add_u32 s19, s19, s21 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_add_u32 s19, s19, s22 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_add_i32 s21, s21, s22 +; GFX10-NEXT: s_add_u32 s19, s19, s23 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s23, s3, s8 +; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_add_i32 s21, s21, s22 +; GFX10-NEXT: s_add_u32 s19, s19, s24 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s24, s2, s9 +; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_add_i32 s21, s21, s22 +; GFX10-NEXT: s_add_u32 s19, s19, s23 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s23, s1, s10 +; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_add_i32 s21, s21, s22 +; GFX10-NEXT: s_add_u32 s19, s19, s24 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s24, s0, s11 +; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_add_i32 s21, s21, s22 +; GFX10-NEXT: s_add_u32 s19, s19, s23 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_mul_i32 s23, s5, s8 +; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_add_i32 s21, s21, s22 +; GFX10-NEXT: s_add_u32 s19, s19, s24 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_mul_i32 s24, s3, s10 +; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_add_i32 s21, s21, s22 +; GFX10-NEXT: s_add_u32 s19, s19, s20 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_mul_i32 s22, s1, s12 +; GFX10-NEXT: s_and_b32 s20, s20, 1 +; GFX10-NEXT: s_add_i32 s21, s21, s20 +; GFX10-NEXT: s_add_u32 s23, s23, s25 +; GFX10-NEXT: s_cselect_b32 s25, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 +; GFX10-NEXT: s_and_b32 s25, s25, 1 +; GFX10-NEXT: s_add_u32 s23, s23, s24 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_add_i32 s24, s25, s24 +; GFX10-NEXT: s_add_u32 s23, s23, s26 +; GFX10-NEXT: s_cselect_b32 s25, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s26, s2, s10 +; GFX10-NEXT: s_and_b32 s25, s25, 1 +; GFX10-NEXT: s_add_i32 s24, s24, s25 +; GFX10-NEXT: s_add_u32 s22, s23, s22 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s25, s1, s11 +; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_add_i32 s23, s24, s23 +; GFX10-NEXT: s_add_u32 s22, s22, s27 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s27, s0, s12 +; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s20, s22, s20 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_mul_i32 s24, s6, s8 +; GFX10-NEXT: s_and_b32 s22, s22, 1 +; GFX10-NEXT: s_add_i32 s22, s23, s22 +; GFX10-NEXT: s_add_u32 s20, s20, s28 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_i32 s28, s5, s9 +; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_add_i32 s22, s22, s23 +; GFX10-NEXT: s_add_u32 s20, s20, s26 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_i32 s26, s4, s10 +; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_add_i32 s22, s22, s23 +; GFX10-NEXT: s_add_u32 s20, s20, s25 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_i32 s25, s3, s11 +; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_add_i32 s22, s22, s23 +; GFX10-NEXT: s_add_u32 s20, s20, s27 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_i32 s27, s2, s12 +; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_add_i32 s22, s22, s23 +; GFX10-NEXT: s_add_u32 s20, s20, s21 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_mul_i32 s23, s1, s13 +; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_add_i32 s22, s22, s21 +; GFX10-NEXT: s_add_u32 s21, s24, s28 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_i32 s28, s0, s14 +; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_add_u32 s21, s21, s26 +; GFX10-NEXT: s_cselect_b32 s26, 1, 0 +; GFX10-NEXT: s_and_b32 s26, s26, 1 +; GFX10-NEXT: s_add_i32 s24, s24, s26 +; GFX10-NEXT: s_add_u32 s21, s21, s25 +; GFX10-NEXT: s_cselect_b32 s25, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s26, s5, s8 +; GFX10-NEXT: s_and_b32 s25, s25, 1 +; GFX10-NEXT: s_add_i32 s24, s24, s25 +; GFX10-NEXT: s_add_u32 s21, s21, s27 +; GFX10-NEXT: s_cselect_b32 s25, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s27, s4, s9 +; GFX10-NEXT: s_and_b32 s25, s25, 1 +; GFX10-NEXT: s_add_i32 s24, s24, s25 +; GFX10-NEXT: s_add_u32 s21, s21, s23 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s25, s3, s10 +; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_add_i32 s23, s24, s23 +; GFX10-NEXT: s_add_u32 s21, s21, s28 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s28, s2, s11 +; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s26 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s26, s1, s12 +; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s27 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s27, s0, s13 +; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s25 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_i32 s25, s6, s9 +; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_mul_hi_u32 s6, s6, s8 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s28 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s26 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_i32 s26, s5, s10 +; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_mul_hi_u32 s5, s5, s9 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s27 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_mul_i32 s27, s4, s11 +; GFX10-NEXT: s_and_b32 s24, s24, 1 +; GFX10-NEXT: s_mul_hi_u32 s4, s4, s10 +; GFX10-NEXT: s_add_i32 s23, s23, s24 +; GFX10-NEXT: s_add_u32 s21, s21, s22 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_add_i32 s7, s7, s25 +; GFX10-NEXT: s_mul_i32 s24, s3, s12 +; GFX10-NEXT: s_add_i32 s7, s7, s26 +; GFX10-NEXT: s_mul_i32 s25, s2, s13 +; GFX10-NEXT: s_add_i32 s7, s7, s27 +; GFX10-NEXT: s_mul_i32 s26, s1, s14 +; GFX10-NEXT: s_add_i32 s7, s7, s24 +; GFX10-NEXT: s_mul_hi_u32 s3, s3, s11 +; GFX10-NEXT: s_add_i32 s7, s7, s25 +; GFX10-NEXT: s_mul_hi_u32 s2, s2, s12 +; GFX10-NEXT: s_add_i32 s7, s7, s26 +; GFX10-NEXT: s_mul_hi_u32 s1, s1, s13 +; GFX10-NEXT: s_add_i32 s7, s7, s15 +; GFX10-NEXT: s_add_i32 s6, s7, s6 +; GFX10-NEXT: s_add_i32 s5, s6, s5 +; GFX10-NEXT: s_mov_b32 s6, s21 +; GFX10-NEXT: s_add_i32 s4, s5, s4 +; GFX10-NEXT: s_mov_b32 s5, s20 +; GFX10-NEXT: s_add_i32 s3, s4, s3 +; GFX10-NEXT: s_mul_hi_u32 s4, s0, s14 +; GFX10-NEXT: s_add_i32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s3, s22, 1 +; GFX10-NEXT: s_add_i32 s1, s2, s1 +; GFX10-NEXT: s_add_i32 s23, s23, s3 +; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_mul_i32 s0, s0, s8 +; GFX10-NEXT: s_add_i32 s7, s1, s23 +; GFX10-NEXT: s_mov_b32 s1, s16 +; GFX10-NEXT: s_mov_b32 s2, s17 +; GFX10-NEXT: s_mov_b32 s3, s18 +; GFX10-NEXT: s_mov_b32 s4, s19 +; GFX10-NEXT: ; return to shader part epilog %result = mul i256 %num, %den %cast = bitcast i256 %result to <8 x i32> ret <8 x i32> %cast diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) { ; GFX9-LABEL: v_mul_v2i16: @@ -16,6 +17,13 @@ ; GFX8-NEXT: v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_mul_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %mul = mul <2 x i16> %a, %b ret <2 x i16> %mul } @@ -35,6 +43,13 @@ ; GFX8-NEXT: v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_mul_v2i16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16> %mul = mul <2 x i16> %cast.neg.a, %b @@ -56,6 +71,13 @@ ; GFX8-NEXT: v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_mul_v2i16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg <2 x half> %b %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16> %mul = mul <2 x i16> %a, %cast.neg.b @@ -79,6 +101,13 @@ ; GFX8-NEXT: v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_mul_v2i16_fneg_lhs_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %neg.b = fneg <2 x half> %b %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -1,12 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps i32 @s_orn2_i32(i32 inreg %src0, i32 inreg %src1) { ; GCN-LABEL: s_orn2_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_orn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 ret i32 %or @@ -17,6 +23,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_orn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i32_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %not.src1, %src0 ret i32 %or @@ -28,6 +39,12 @@ ; GCN-NEXT: s_not_b32 s1, s3 ; GCN-NEXT: s_orn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i32_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s3 +; GFX10-NEXT: s_not_b32 s1, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 %insert.0 = insertvalue { i32, i32 } undef, i32 %or, 0 @@ -41,6 +58,12 @@ ; GCN-NEXT: s_orn2_b32 s0, s2, s4 ; GCN-NEXT: s_orn2_b32 s1, s3, s4 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i32_multi_foldable_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s4 +; GFX10-NEXT: s_orn2_b32 s1, s3, s4 +; GFX10-NEXT: ; return to shader part epilog %not.src2 = xor i32 %src2, -1 %or0 = or i32 %src0, %not.src2 %or1 = or i32 %src1, %not.src2 @@ -56,6 +79,14 @@ ; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_orn2_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 ret i32 %or @@ -67,6 +98,12 @@ ; GCN-NEXT: v_xor_b32_e32 v0, -1, v0 ; GCN-NEXT: v_or_b32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_orn2_i32_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 %cast = bitcast i32 %or to float @@ -79,6 +116,12 @@ ; GCN-NEXT: s_not_b32 s0, s2 ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_orn2_i32_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_not_b32 s0, s2 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 %cast = bitcast i32 %or to float @@ -90,6 +133,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 ret i64 %or @@ -100,6 +148,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i64_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %not.src1, %src0 ret i64 %or @@ -111,6 +164,12 @@ ; GCN-NEXT: s_orn2_b64 s[0:1], s[2:3], s[6:7] ; GCN-NEXT: s_orn2_b64 s[2:3], s[4:5], s[6:7] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i64_multi_foldable_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[6:7] +; GFX10-NEXT: s_orn2_b64 s[2:3], s[4:5], s[6:7] +; GFX10-NEXT: ; return to shader part epilog %not.src2 = xor i64 %src2, -1 %or0 = or i64 %src0, %not.src2 %or1 = or i64 %src1, %not.src2 @@ -127,6 +186,12 @@ ; GCN-NEXT: s_mov_b32 s2, s6 ; GCN-NEXT: s_mov_b32 s3, s7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i64_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: s_not_b64 s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 %insert.0 = insertvalue { i64, i64 } undef, i64 %or, 0 @@ -143,6 +208,16 @@ ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_orn2_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 ret i64 %or @@ -156,6 +231,14 @@ ; GCN-NEXT: v_or_b32_e32 v0, s2, v0 ; GCN-NEXT: v_or_b32_e32 v1, s3, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_orn2_i64_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 %cast = bitcast i64 %or to <2 x float> @@ -169,6 +252,13 @@ ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 ; GCN-NEXT: v_or_b32_e32 v1, s1, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_orn2_i64_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_not_b64 s[0:1], s[2:3] +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 %cast = bitcast i64 %or to <2 x float> @@ -180,6 +270,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i32> %src1, %or = or <2 x i32> %src0, %not.src1 ret <2 x i32> %or @@ -190,6 +285,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_v2i32_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i32> %src1, %or = or <2 x i32> %not.src1, %src0 ret <2 x i32> %or @@ -200,6 +300,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_orn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 ret i16 %or @@ -210,6 +315,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_orn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %not.src1, %src0 ret i16 %or @@ -221,6 +331,12 @@ ; GCN-NEXT: s_xor_b32 s1, s3, -1 ; GCN-NEXT: s_orn2_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i16_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s3 +; GFX10-NEXT: s_xor_b32 s1, s3, -1 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 %insert.0 = insertvalue { i16, i16 } undef, i16 %or, 0 @@ -234,6 +350,12 @@ ; GCN-NEXT: s_orn2_b32 s0, s2, s4 ; GCN-NEXT: s_orn2_b32 s1, s3, s4 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_i16_multi_foldable_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s4 +; GFX10-NEXT: s_orn2_b32 s1, s3, s4 +; GFX10-NEXT: ; return to shader part epilog %not.src2 = xor i16 %src2, -1 %or0 = or i16 %src0, %not.src2 %or1 = or i16 %src1, %not.src2 @@ -249,6 +371,14 @@ ; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_orn2_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 ret i16 %or @@ -261,6 +391,13 @@ ; GCN-NEXT: v_or_b32_e32 v0, s2, v0 ; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_orn2_i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 %zext = zext i16 %or to i32 @@ -275,6 +412,13 @@ ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 ; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_orn2_i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_xor_b32 s0, s2, -1 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 %zext = zext i16 %or to i32 @@ -293,6 +437,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_orn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %src0, %not.src1 %cast = bitcast <2 x i16> %or to i32 @@ -310,6 +459,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_orn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_v2i16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %not.src1, %src0 %cast = bitcast <2 x i16> %or to i32 @@ -328,6 +482,12 @@ ; GFX9-NEXT: s_xor_b32 s1, s3, -1 ; GFX9-NEXT: s_orn2_b32 s0, s2, s3 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_v2i16_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s3 +; GFX10-NEXT: s_xor_b32 s1, s3, -1 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %src0, %not.src1 @@ -351,6 +511,12 @@ ; GFX9-NEXT: s_orn2_b32 s0, s2, s4 ; GFX9-NEXT: s_orn2_b32 s1, s3, s4 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_v2i16_multi_foldable_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_orn2_b32 s0, s2, s4 +; GFX10-NEXT: s_orn2_b32 s1, s3, s4 +; GFX10-NEXT: ; return to shader part epilog %not.src2 = xor <2 x i16> %src2, %or0 = or <2 x i16> %src0, %not.src2 %or1 = or <2 x i16> %src1, %not.src2 @@ -369,6 +535,14 @@ ; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_orn2_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %src0, %not.src1 ret <2 x i16> %or @@ -435,6 +609,14 @@ ; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, -1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %or = or <4 x i16> %src0, %not.src1 %cast = bitcast <4 x i16> %or to i64 @@ -470,6 +652,14 @@ ; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_v4i16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, -1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %or = or <4 x i16> %not.src1, %src0 %cast = bitcast <4 x i16> %or to i64 @@ -507,6 +697,16 @@ ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_v4i16_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, -1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %or = or <4 x i16> %src0, %not.src1 @@ -554,6 +754,15 @@ ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] ; GFX9-NEXT: s_or_b64 s[2:3], s[4:5], s[6:7] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_orn2_v4i16_multi_foldable_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, -1 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] +; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[6:7] +; GFX10-NEXT: ; return to shader part epilog %not.src2 = xor <4 x i16> %src2, %or0 = or <4 x i16> %src0, %not.src2 %or1 = or <4 x i16> %src1, %not.src2 @@ -598,6 +807,16 @@ ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_orn2_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor <4 x i16> %src1, %or = or <4 x i16> %src0, %not.src1 ret <4 x i16> %or diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --- name: remove_and_255_zextload diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define float @v_roundeven_f32(float %x) { ; GFX6-LABEL: v_roundeven_f32: @@ -28,6 +29,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call float @llvm.roundeven.f32(float %x) ret float %roundeven } @@ -60,6 +68,14 @@ ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 ; GFX9-NEXT: v_rndne_f32_e32 v1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10-NEXT: v_rndne_f32_e32 v1, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %x) ret <2 x float> %roundeven } @@ -96,6 +112,15 @@ ; GFX9-NEXT: v_rndne_f32_e32 v1, v1 ; GFX9-NEXT: v_rndne_f32_e32 v2, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_v3f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10-NEXT: v_rndne_f32_e32 v1, v1 +; GFX10-NEXT: v_rndne_f32_e32 v2, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call <3 x float> @llvm.roundeven.v3f32(<3 x float> %x) ret <3 x float> %roundeven } @@ -136,6 +161,16 @@ ; GFX9-NEXT: v_rndne_f32_e32 v2, v2 ; GFX9-NEXT: v_rndne_f32_e32 v3, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10-NEXT: v_rndne_f32_e32 v1, v1 +; GFX10-NEXT: v_rndne_f32_e32 v2, v2 +; GFX10-NEXT: v_rndne_f32_e32 v3, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x) ret <4 x float> %roundeven } @@ -168,6 +203,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rndne_f16_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f16_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call half @llvm.roundeven.f16(half %x) ret half %roundeven } @@ -223,6 +265,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f16_e32 v1, v0 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x) ret <2 x half> %roundeven } @@ -282,6 +333,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_v2f16_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10-NEXT: v_rndne_f16_e32 v1, v0 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, 0xffff, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg) ret <2 x half> %roundeven @@ -347,6 +408,19 @@ ; GFX9-NEXT: v_and_or_b32 v0, v2, v4, v0 ; GFX9-NEXT: v_and_or_b32 v1, v3, v4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_v4f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f16_e32 v3, v1 +; GFX10-NEXT: v_rndne_f16_e32 v2, v0 +; GFX10-NEXT: v_rndne_f16_sdwa v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v7 +; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x) ret <4 x half> %roundeven } @@ -376,6 +450,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rndne_f32_e64 v0, |v0| ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_f32_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f32_e64 v0, |v0| +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %roundeven = call float @llvm.roundeven.f32(float %fabs.x) ret float %roundeven @@ -401,6 +482,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_rndne_f32_e32 v0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_roundeven_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_rndne_f32_e32 v0, s0 +; GFX10-NEXT: ; return to shader part epilog %roundeven = call float @llvm.roundeven.f32(float %x) ret float %roundeven } @@ -429,6 +515,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rndne_f32_e64 v0, -v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_f32_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f32_e64 v0, -v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %roundeven = call float @llvm.roundeven.f32(float %neg.x) ret float %roundeven @@ -467,6 +560,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call double @llvm.roundeven.f64(double %x) ret double %roundeven } @@ -506,6 +606,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_f64_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg double %x %roundeven = call double @llvm.roundeven.f64(double %neg.x) ret double %roundeven @@ -556,6 +663,14 @@ ; GFX9-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] ; GFX9-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_roundeven_v2f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] +; GFX10-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x) ret <2 x double> %roundeven } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s ; Test optimization to reduce shifts to narrower sizes. @@ -12,6 +13,13 @@ ; GCN-NEXT: s_lshl_b32 s0, s0, 2 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i64_zext_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s0, s0, -2.0 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: ; return to shader part epilog %and = and i32 %x, 1073741823 %ext = zext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -26,6 +34,15 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i64_zext_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %and = and i32 %x, 1073741823 %ext = zext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -39,6 +56,13 @@ ; GCN-NEXT: s_lshl_b32 s0, s0, 2 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i64_sext_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s0, s0, 0x1fffffff +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: ; return to shader part epilog %and = and i32 %x, 536870911 %ext = sext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -53,6 +77,15 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i64_sext_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %and = and i32 %x, 536870911 %ext = sext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -66,6 +99,13 @@ ; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i64_zext_i32_overflow: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_bitset0_b32 s0, 31 +; GFX10-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX10-NEXT: ; return to shader part epilog %and = and i32 %x, 2147483647 %ext = zext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -96,6 +136,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i64_zext_i32_overflow: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %and = and i32 %x, 2147483647 %ext = zext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -109,6 +158,13 @@ ; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i64_sext_i32_overflow: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_bitset0_b32 s0, 31 +; GFX10-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX10-NEXT: ; return to shader part epilog %and = and i32 %x, 2147483647 %ext = sext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -139,6 +195,15 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i64_sext_i32_overflow: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %and = and i32 %x, 2147483647 %ext = sext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -188,6 +253,21 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc ; GFX9-NEXT: global_store_dword v[2:3], v1, off ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mulu24_shl64: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_and_b32_e32 v0, 6, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v4, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX10-NEXT: global_store_dword v[2:3], v1, off +; GFX10-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = and i32 %tmp, 6 @@ -258,6 +338,21 @@ ; GFX9-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] ; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: muli24_shl64: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v1, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_or_b32_e32 v1, 0xff800000, v1 +; GFX10-NEXT: v_mul_i32_i24_e32 v1, -7, v1 +; GFX10-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] +; GFX10-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] +; GFX10-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = sext i32 %tmp to i64 @@ -284,6 +379,18 @@ ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v2i64_zext_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_brev_b32 s2, -4 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_mov_b32 s2, s1 +; GFX10-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GFX10-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX10-NEXT: ; return to shader part epilog %and = and <2 x i32> %x, %ext = zext <2 x i32> %and to <2 x i64> %shl = shl <2 x i64> %ext, @@ -326,6 +433,19 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v2i64_zext_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_brev_b32 s4, -4 +; GFX10-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v4, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v3 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %and = and <2 x i32> %x, %ext = zext <2 x i32> %and to <2 x i64> %shl = shl <2 x i64> %ext, @@ -344,6 +464,18 @@ ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v2i64_sext_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_brev_b32 s2, -8 +; GFX10-NEXT: s_mov_b32 s3, s2 +; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_mov_b32 s2, s1 +; GFX10-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 +; GFX10-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x200000 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX10-NEXT: ; return to shader part epilog %and = and <2 x i32> %x, %ext = sext <2 x i32> %and to <2 x i64> %shl = shl <2 x i64> %ext, @@ -386,6 +518,19 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v2i64_sext_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_brev_b32 s4, -8 +; GFX10-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %and = and <2 x i32> %x, %ext = sext <2 x i32> %and to <2 x i64> %shl = shl <2 x i64> %ext, @@ -413,6 +558,13 @@ ; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i32_zext_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s0, s0, 0x3fff +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: ; return to shader part epilog %and = and i16 %x, 16383 %ext = zext i16 %and to i32 %shl = shl i32 %ext, 2 @@ -441,6 +593,15 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0x3fff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i32_zext_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x3fff, v0 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 2, v0 +; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10-NEXT: s_setpc_b64 s[30:31] %and = and i16 %x, 16383 %ext = zext i16 %and to i32 %shl = shl i32 %ext, 2 @@ -476,6 +637,15 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_lshl_b32 s1, s1, 2 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v2i32_zext_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s0, s0, 0x3fff3fff +; GFX10-NEXT: s_and_b32 s1, s0, 0xffff +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshl_b32 s0, s1, 2 +; GFX10-NEXT: s_lshl_b32 s1, s2, 2 +; GFX10-NEXT: ; return to shader part epilog %and = and <2 x i16> %x, %ext = zext <2 x i16> %and to <2 x i32> %shl = shl <2 x i32> %ext, @@ -515,6 +685,16 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v2i32_zext_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x3fff3fff, v0 +; GFX10-NEXT: s_mov_b32 s4, 2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %and = and <2 x i16> %x, %ext = zext <2 x i16> %and to <2 x i32> %shl = shl <2 x i32> %ext, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s define i8 @v_shl_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_shl_i8: @@ -22,6 +23,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i8 %value, %amount ret i8 %result } @@ -44,6 +53,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 7, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i8_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 7, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i8 %value, 7 ret i8 %result } @@ -70,6 +86,14 @@ ; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = shl i8 %value, %amount ret i8 %result } @@ -91,6 +115,12 @@ ; GFX9-NEXT: s_bfe_u32 s1, 7, 0x100000 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i8_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_bfe_u32 s1, 7, 0x100000 +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = shl i8 %value, 7 ret i8 %result } @@ -103,6 +133,14 @@ ; GCN-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i24 %value, %amount ret i24 %result } @@ -113,6 +151,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i24_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i24 %value, 7 ret i24 %result } @@ -123,6 +168,12 @@ ; GCN-NEXT: s_and_b32 s1, s1, 0xffffff ; GCN-NEXT: s_lshl_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s1, s1, 0xffffff +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = shl i24 %value, %amount ret i24 %result } @@ -132,6 +183,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_lshl_b32 s0, s0, 7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i24_7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 7 +; GFX10-NEXT: ; return to shader part epilog %result = shl i24 %value, 7 ret i24 %result } @@ -142,6 +198,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i32 %value, %amount ret i32 %result } @@ -152,6 +215,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v0, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i32_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i32 %value, 31 ret i32 %result } @@ -161,6 +231,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_lshl_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = shl i32 %value, %amount ret i32 %result } @@ -170,6 +245,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_lshl_b32 s0, s0, 31 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i32_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 31 +; GFX10-NEXT: ; return to shader part epilog %result = shl i32 %value, 31 ret i32 %result } @@ -189,6 +269,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl_i32_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s0 +; GFX10-NEXT: ; return to shader part epilog %result = shl i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -199,6 +284,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: v_lshlrev_b32_e32 v0, s0, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl_i32_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %result = shl i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -211,6 +301,14 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i32> %value, %amount ret <2 x i32> %result } @@ -222,6 +320,14 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v0, 31, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 31, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v2i32_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i32> %value, ret <2 x i32> %result } @@ -232,6 +338,12 @@ ; GCN-NEXT: s_lshl_b32 s0, s0, s2 ; GCN-NEXT: s_lshl_b32 s1, s1, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %result = shl <2 x i32> %value, %amount ret <2 x i32> %result } @@ -244,6 +356,15 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, v4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <3 x i32> %value, %amount ret <3 x i32> %result } @@ -255,6 +376,13 @@ ; GCN-NEXT: s_lshl_b32 s1, s1, s4 ; GCN-NEXT: s_lshl_b32 s2, s2, s5 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, s4 +; GFX10-NEXT: s_lshl_b32 s2, s2, s5 +; GFX10-NEXT: ; return to shader part epilog %result = shl <3 x i32> %value, %amount ret <3 x i32> %result } @@ -268,6 +396,16 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v3, v7, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, v7, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <4 x i32> %value, %amount ret <4 x i32> %result } @@ -280,6 +418,14 @@ ; GCN-NEXT: s_lshl_b32 s2, s2, s6 ; GCN-NEXT: s_lshl_b32 s3, s3, s7 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_lshl_b32 s1, s1, s5 +; GFX10-NEXT: s_lshl_b32 s2, s2, s6 +; GFX10-NEXT: s_lshl_b32 s3, s3, s7 +; GFX10-NEXT: ; return to shader part epilog %result = shl <4 x i32> %value, %amount ret <4 x i32> %result } @@ -294,6 +440,17 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v3, v8, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v4, v9, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v5i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, v5, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, v6, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, v8, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, v9, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <5 x i32> %value, %amount ret <5 x i32> %result } @@ -307,6 +464,15 @@ ; GCN-NEXT: s_lshl_b32 s3, s3, s8 ; GCN-NEXT: s_lshl_b32 s4, s4, s9 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v5i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s1, s1, s6 +; GFX10-NEXT: s_lshl_b32 s2, s2, s7 +; GFX10-NEXT: s_lshl_b32 s3, s3, s8 +; GFX10-NEXT: s_lshl_b32 s4, s4, s9 +; GFX10-NEXT: ; return to shader part epilog %result = shl <5 x i32> %value, %amount ret <5 x i32> %result } @@ -332,6 +498,28 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v14, v30, v14 ; GCN-NEXT: v_lshlrev_b32_e32 v15, v31, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, v16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, v17, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, v18, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, v19, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, v20, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, v21, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, v22, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, v23, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, v24, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, v25, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, v26, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, v27, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, v28, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, v29, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, v30, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, v31, v15 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <16 x i32> %value, %amount ret <16 x i32> %result } @@ -356,6 +544,26 @@ ; GCN-NEXT: s_lshl_b32 s14, s14, s30 ; GCN-NEXT: s_lshl_b32 s15, s15, s31 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, s16 +; GFX10-NEXT: s_lshl_b32 s1, s1, s17 +; GFX10-NEXT: s_lshl_b32 s2, s2, s18 +; GFX10-NEXT: s_lshl_b32 s3, s3, s19 +; GFX10-NEXT: s_lshl_b32 s4, s4, s20 +; GFX10-NEXT: s_lshl_b32 s5, s5, s21 +; GFX10-NEXT: s_lshl_b32 s6, s6, s22 +; GFX10-NEXT: s_lshl_b32 s7, s7, s23 +; GFX10-NEXT: s_lshl_b32 s8, s8, s24 +; GFX10-NEXT: s_lshl_b32 s9, s9, s25 +; GFX10-NEXT: s_lshl_b32 s10, s10, s26 +; GFX10-NEXT: s_lshl_b32 s11, s11, s27 +; GFX10-NEXT: s_lshl_b32 s12, s12, s28 +; GFX10-NEXT: s_lshl_b32 s13, s13, s29 +; GFX10-NEXT: s_lshl_b32 s14, s14, s30 +; GFX10-NEXT: s_lshl_b32 s15, s15, s31 +; GFX10-NEXT: ; return to shader part epilog %result = shl <16 x i32> %value, %amount ret <16 x i32> %result } @@ -379,6 +587,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i16 %value, %amount ret i16 %result } @@ -388,6 +603,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i16_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i16 %value, 31 ret i16 %result } @@ -414,6 +635,14 @@ ; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = shl i16 %value, %amount ret i16 %result } @@ -435,6 +664,12 @@ ; GFX9-NEXT: s_bfe_u32 s1, 15, 0x100000 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i16_15: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_bfe_u32 s1, 15, 0x100000 +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = shl i16 %value, 15 ret i16 %result } @@ -455,6 +690,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl_i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v0, s0 +; GFX10-NEXT: ; return to shader part epilog %result = shl i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -476,6 +716,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b16_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl_i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b16_e64 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %result = shl i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -510,6 +755,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i16> %value, %amount ret <2 x i16> %result } @@ -539,6 +791,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v2i16_15: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i16> %value, ret <2 x i16> %result } @@ -580,6 +839,15 @@ ; GFX9-NEXT: s_lshl_b32 s1, s2, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %result = shl <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to i32 ret i32 %cast @@ -613,6 +881,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl_v2i16_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, s0 +; GFX10-NEXT: ; return to shader part epilog %result = shl <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -646,6 +919,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl_v2i16_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %result = shl <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -702,6 +980,14 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -765,6 +1051,20 @@ ; GFX9-NEXT: s_lshl_b32 s2, s2, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s2, s4, s5 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s3, s4, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %result = shl <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x i32> ret <2 x i32> %cast @@ -858,6 +1158,16 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v2, v6, v2 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, v7, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v8i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1 +; GFX10-NEXT: v_pk_lshlrev_b16 v2, v6, v2 +; GFX10-NEXT: v_pk_lshlrev_b16 v3, v7, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x float> ret <4 x float> %cast @@ -965,6 +1275,30 @@ ; GFX9-NEXT: s_lshl_b32 s4, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v8i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_lshr_b32 s9, s4, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_lshl_b32 s4, s8, s9 +; GFX10-NEXT: s_lshr_b32 s8, s1, 16 +; GFX10-NEXT: s_lshr_b32 s9, s5, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, s5 +; GFX10-NEXT: s_lshl_b32 s5, s8, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, s6 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshr_b32 s6, s7, 16 +; GFX10-NEXT: s_lshl_b32 s3, s3, s7 +; GFX10-NEXT: s_lshl_b32 s5, s5, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10-NEXT: ; return to shader part epilog %result = shl <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x i32> ret <4 x i32> %cast @@ -988,6 +1322,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i64 %value, %amount ret i64 %result } @@ -999,6 +1340,14 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v1, 31, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i64_63: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 31, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i64 %value, 63 ret i64 %result } @@ -1010,6 +1359,14 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i64_33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i64 %value, 33 ret i64 %result } @@ -1021,6 +1378,14 @@ ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i64_32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i64 %value, 32 ret i64 %result } @@ -1043,6 +1408,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_i64_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i64 %value, 31 ret i64 %result } @@ -1052,6 +1424,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: ; return to shader part epilog %result = shl i64 %value, %amount ret i64 %result } @@ -1062,6 +1439,12 @@ ; GCN-NEXT: s_lshl_b32 s1, s0, 31 ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i64_63: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s1, s0, 31 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: ; return to shader part epilog %result = shl i64 %value, 63 ret i64 %result } @@ -1072,6 +1455,12 @@ ; GCN-NEXT: s_lshl_b32 s1, s0, 1 ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i64_33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s1, s0, 1 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: ; return to shader part epilog %result = shl i64 %value, 33 ret i64 %result } @@ -1082,6 +1471,12 @@ ; GCN-NEXT: s_mov_b32 s1, s0 ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i64_32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: ; return to shader part epilog %result = shl i64 %value, 32 ret i64 %result } @@ -1091,6 +1486,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 31 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_i64_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 31 +; GFX10-NEXT: ; return to shader part epilog %result = shl i64 %value, 31 ret i64 %result } @@ -1110,6 +1510,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl_i64_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] +; GFX10-NEXT: ; return to shader part epilog %result = shl i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1130,6 +1535,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], s0, v[0:1] ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl_i64_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s0, v[0:1] +; GFX10-NEXT: ; return to shader part epilog %result = shl i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1156,6 +1566,18 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[10:11] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v6, v[7:8] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i64> %value, %amount ret <2 x i64> %result } @@ -1181,6 +1603,14 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 31, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl_v2i64_31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 31, v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i64> %value, ret <2 x i64> %result } @@ -1191,6 +1621,12 @@ ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 ; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX10-NEXT: ; return to shader part epilog %result = shl <2 x i64> %value, %amount ret <2 x i64> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; Test gfx9+ s_shl[1-4]_add_u32 pattern matching @@ -15,6 +16,11 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl1_add_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl1_add_u32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %shl = shl i32 %src0, 1 %add = add i32 %shl, %src1 ret i32 %add @@ -31,6 +37,11 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 2 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl2_add_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl2_add_u32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %shl = shl i32 %src0, 2 %add = add i32 %shl, %src1 ret i32 %add @@ -47,6 +58,11 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl3_add_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl3_add_u32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %shl = shl i32 %src0, 3 %add = add i32 %shl, %src1 ret i32 %add @@ -63,6 +79,11 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl4_add_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl4_add_u32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog %shl = shl i32 %src0, 4 %add = add i32 %shl, %src1 ret i32 %add @@ -92,6 +113,13 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl1_add_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 1, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %shl = shl i32 %src0, 1 %add = add i32 %shl, %src1 ret i32 %add @@ -110,6 +138,13 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl2_add_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %shl = shl i32 %src0, 2 %add = add i32 %shl, %src1 ret i32 %add @@ -128,6 +163,13 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl3_add_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 3, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %shl = shl i32 %src0, 3 %add = add i32 %shl, %src1 ret i32 %add @@ -146,6 +188,13 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl4_add_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 4, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %shl = shl i32 %src0, 4 %add = add i32 %shl, %src1 ret i32 %add @@ -164,6 +213,13 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_shl5_add_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 5, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %shl = shl i32 %src0, 5 %add = add i32 %shl, %src1 ret i32 %add @@ -183,6 +239,12 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl1_add_u32_vgpr1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %shl = shl i32 %src0, 1 %add = add i32 %shl, %src1 %cast = bitcast i32 %add to float @@ -201,6 +263,12 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl2_add_u32_vgpr1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %shl = shl i32 %src0, 2 %add = add i32 %shl, %src1 %cast = bitcast i32 %add to float @@ -219,6 +287,12 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl3_add_u32_vgpr1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %shl = shl i32 %src0, 3 %add = add i32 %shl, %src1 %cast = bitcast i32 %add to float @@ -237,6 +311,12 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl4_add_u32_vgpr1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %shl = shl i32 %src0, 4 %add = add i32 %shl, %src1 %cast = bitcast i32 %add to float @@ -255,6 +335,12 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, 5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: shl5_add_u32_vgpr1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl_b32 s0, s0, 5 +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog %shl = shl i32 %src0, 5 %add = add i32 %shl, %src1 %cast = bitcast i32 %add to float @@ -275,6 +361,12 @@ ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl1_add_u32_v2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl1_add_u32 s0, s0, s2 +; GFX10-NEXT: s_lshl1_add_u32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %shl = shl <2 x i32> %src0, %add = add <2 x i32> %shl, %src1 ret <2 x i32> %add @@ -294,6 +386,12 @@ ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl2_add_u32_v2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX10-NEXT: s_lshl2_add_u32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %shl = shl <2 x i32> %src0, %add = add <2 x i32> %shl, %src1 ret <2 x i32> %add @@ -313,6 +411,12 @@ ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl3_add_u32_v2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl3_add_u32 s0, s0, s2 +; GFX10-NEXT: s_lshl3_add_u32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %shl = shl <2 x i32> %src0, %add = add <2 x i32> %shl, %src1 ret <2 x i32> %add @@ -332,6 +436,12 @@ ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl4_add_u32_v2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl4_add_u32 s0, s0, s2 +; GFX10-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %shl = shl <2 x i32> %src0, %add = add <2 x i32> %shl, %src1 ret <2 x i32> %add @@ -351,6 +461,12 @@ ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_shl_2_4_add_u32_v2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX10-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog %shl = shl <2 x i32> %src0, %add = add <2 x i32> %shl, %src1 ret <2 x i32> %add diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; FIXME: ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s @@ -32,6 +33,20 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: ds_write_b128 v4, v[0:3] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: ds_write_b128 v4, v[0:3] +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out ret void } @@ -141,6 +156,59 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:15 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s0, s4, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: s_lshr_b32 s3, s4, 24 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s4, s5, 16 +; GFX10-NEXT: s_lshr_b32 s8, s5, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_lshr_b32 s5, s6, 8 +; GFX10-NEXT: s_lshr_b32 s9, s6, 16 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v15, s1 +; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: s_lshr_b32 s0, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: v_mov_b32_e32 v11, s9 +; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: v_mov_b32_e32 v8, s4 +; GFX10-NEXT: v_mov_b32_e32 v19, s8 +; GFX10-NEXT: ds_write_b8 v1, v0 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 +; GFX10-NEXT: ds_write_b8 v1, v4 offset:1 +; GFX10-NEXT: ds_write_b8 v1, v15 offset:2 +; GFX10-NEXT: ds_write_b8 v1, v6 offset:3 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:5 +; GFX10-NEXT: ds_write_b8 v1, v8 offset:6 +; GFX10-NEXT: ds_write_b8 v1, v19 offset:7 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_lshr_b32 s0, s7, 8 +; GFX10-NEXT: s_lshr_b32 s1, s7, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 +; GFX10-NEXT: ds_write_b8 v1, v11 offset:10 +; GFX10-NEXT: s_lshr_b32 s2, s7, 24 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:11 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:12 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:13 +; GFX10-NEXT: ds_write_b8 v1, v4 offset:14 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:15 +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void } @@ -202,6 +270,35 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:14 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: s_lshr_b32 s1, s5, 16 +; GFX10-NEXT: s_lshr_b32 s2, s6, 16 +; GFX10-NEXT: s_lshr_b32 s3, s7, 16 +; GFX10-NEXT: v_mov_b32_e32 v4, s7 +; GFX10-NEXT: v_mov_b32_e32 v11, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: v_mov_b32_e32 v8, s3 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 +; GFX10-NEXT: ds_write_b16 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b16 v1, v4 offset:12 +; GFX10-NEXT: ds_write_b16 v1, v11 offset:2 +; GFX10-NEXT: ds_write_b16 v1, v6 offset:6 +; GFX10-NEXT: ds_write_b16 v1, v7 offset:10 +; GFX10-NEXT: ds_write_b16 v1, v8 offset:14 +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void } @@ -235,6 +332,21 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset0:2 offset1:3 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v4, s7 +; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 +; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3 +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4 ret void } @@ -266,6 +378,20 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32_align8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8 ret void } @@ -297,6 +423,20 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: ds_write_b128 v4, v[0:3] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32_align16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: ds_write_b128 v4, v[0:3] +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; FIXME: ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s @@ -30,6 +31,19 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: ds_write_b96 v3, v[0:2] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: v_mov_b32_e32 v2, s14 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: ds_write_b96 v3, v[0:2] +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out ret void } @@ -117,6 +131,48 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s0, s12, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s5, s13, 24 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: s_lshr_b32 s1, s12, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s13 +; GFX10-NEXT: s_lshr_b32 s3, s12, 24 +; GFX10-NEXT: s_lshr_b32 s6, s14, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v15, s5 +; GFX10-NEXT: s_lshr_b32 s2, s13, 8 +; GFX10-NEXT: s_lshr_b32 s4, s13, 16 +; GFX10-NEXT: s_lshr_b32 s7, s14, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s14 +; GFX10-NEXT: v_mov_b32_e32 v11, s1 +; GFX10-NEXT: s_lshr_b32 s8, s14, 24 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: v_mov_b32_e32 v10, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: v_mov_b32_e32 v8, s4 +; GFX10-NEXT: ds_write_b8 v1, v0 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 +; GFX10-NEXT: ds_write_b8 v1, v4 offset:1 +; GFX10-NEXT: ds_write_b8 v1, v11 offset:2 +; GFX10-NEXT: ds_write_b8 v1, v6 offset:3 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:5 +; GFX10-NEXT: ds_write_b8 v1, v8 offset:6 +; GFX10-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: ds_write_b8 v1, v15 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:11 +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void } @@ -168,6 +224,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:10 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s0, s12, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s13 +; GFX10-NEXT: s_lshr_b32 s1, s13, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s14 +; GFX10-NEXT: s_lshr_b32 s2, s14, 16 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: ds_write_b16 v1, v0 +; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 +; GFX10-NEXT: ds_write_b16 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b16 v1, v4 offset:2 +; GFX10-NEXT: ds_write_b16 v1, v7 offset:6 +; GFX10-NEXT: ds_write_b16 v1, v6 offset:10 +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 ret void } @@ -199,6 +279,20 @@ ; GFX7-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX7-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s14 +; GFX10-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 +; GFX10-NEXT: ds_write_b32 v2, v3 offset:8 +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4 ret void } @@ -230,6 +324,20 @@ ; GFX7-NEXT: ds_write_b64 v2, v[0:1] ; GFX7-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32_align8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s14 +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: ds_write_b32 v2, v3 offset:8 +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8 ret void } @@ -259,6 +367,19 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: ds_write_b96 v3, v[0:2] ; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32_align16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: v_mov_b32_e32 v2, s14 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: ds_write_b96 v3, v[0:2] +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -3,12 +3,18 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps i32 @scalar_xnor_i32_one_use(i32 inreg %a, i32 inreg %b) { ; GCN-LABEL: scalar_xnor_i32_one_use: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_xnor_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: scalar_xnor_i32_one_use: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_xnor_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog entry: %xor = xor i32 %a, %b %r0.val = xor i32 %xor, -1 @@ -47,6 +53,12 @@ ; GFX906-NEXT: s_xor_b32 s0, s0, s1 ; GFX906-NEXT: s_xor_b32 s0, s0, -1 ; GFX906-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: scalar_xnor_v2i16_one_use: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_xor_b32 s0, s0, s1 +; GFX10-NEXT: s_xor_b32 s0, s0, -1 +; GFX10-NEXT: ; return to shader part epilog entry: %xor = xor <2 x i16> %a, %b %r0.val = xor <2 x i16> %xor, @@ -62,6 +74,14 @@ ; GCN-NEXT: s_add_i32 s1, s1, s0 ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: scalar_xnor_i32_mul_use: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_xor_b32 s1, s0, s1 +; GFX10-NEXT: s_not_b32 s2, s1 +; GFX10-NEXT: s_add_i32 s1, s1, s0 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: ; return to shader part epilog entry: %xor = xor i32 %a, %b %r0.val = xor i32 %xor, -1 @@ -76,6 +96,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: scalar_xnor_i64_one_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog %xor = xor i64 %a, %b %r0.val = xor i64 %xor, -1 ret i64 %r0.val @@ -138,6 +163,14 @@ ; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX906-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: scalar_xnor_v4i16_one_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s4, -1 +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %xor = xor <4 x i16> %a, %b %ret = xor <4 x i16> %xor, %cast = bitcast <4 x i16> %ret to i64 @@ -157,6 +190,19 @@ ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: scalar_xnor_i64_mul_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] +; GFX10-NEXT: s_not_b64 s[4:5], s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, s0 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_and_b32 s0, s0, 1 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_addc_u32 s3, s3, s1 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: ; return to shader part epilog %xor = xor i64 %a, %b %r0.val = xor i64 %xor, -1 %r1.val = add i64 %xor, %a @@ -192,6 +238,13 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v1 ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: vector_xnor_i32_one_use: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor3_b32 v0, v0, v1, -1 +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %xor = xor i32 %a, %b %r = xor i32 %xor, -1 @@ -207,6 +260,16 @@ ; GCN-NEXT: v_xor_b32_e32 v0, -1, v0 ; GCN-NEXT: v_xor_b32_e32 v1, -1, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: vector_xnor_i64_one_use: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %xor = xor i64 %a, %b %r = xor i64 %xor, -1 @@ -236,6 +299,11 @@ ; GFX906: ; %bb.0: ; GFX906-NEXT: v_xnor_b32_e32 v0, s0, v0 ; GFX906-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: xnor_s_v_i32_one_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor3_b32 v0, s0, v0, -1 +; GFX10-NEXT: ; return to shader part epilog %xor = xor i32 %s, %v %d = xor i32 %xor, -1 %cast = bitcast i32 %d to float @@ -265,6 +333,11 @@ ; GFX906: ; %bb.0: ; GFX906-NEXT: v_xnor_b32_e64 v0, v0, s0 ; GFX906-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: xnor_v_s_i32_one_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor3_b32 v0, v0, s0, -1 +; GFX10-NEXT: ; return to shader part epilog %xor = xor i32 %v, %s %d = xor i32 %xor, -1 %cast = bitcast i32 %d to float @@ -307,6 +380,15 @@ ; GFX906-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX906-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX906-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: xnor_i64_s_v_one_use: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] +; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: ; return to shader part epilog entry: %b = shl i64 %b64, 29 %xor = xor i64 %a, %b @@ -351,6 +433,15 @@ ; GFX906-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX906-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX906-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: xnor_i64_v_s_one_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] +; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10-NEXT: ; return to shader part epilog %b = shl i64 %b64, 29 %xor = xor i64 %b, %a %r0.val = xor i64 %xor, -1 @@ -385,6 +476,13 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v1 ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: vector_xor_na_b_i32_one_use: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor3_b32 v0, v0, -1, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %na = xor i32 %a, -1 %r = xor i32 %na, %b @@ -418,6 +516,13 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_xnor_b32_e32 v0, v1, v0 ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: vector_xor_a_nb_i32_one_use: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor3_b32 v0, v1, -1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %nb = xor i32 %b, -1 %r = xor i32 %a, %nb @@ -430,6 +535,12 @@ ; GCN-NEXT: s_not_b64 s[2:3], s[2:3] ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: scalar_xor_a_nb_i64_one_use: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_not_b64 s[2:3], s[2:3] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog entry: %nb = xor i64 %b, -1 %r0.val = xor i64 %a, %nb @@ -443,6 +554,12 @@ ; GCN-NEXT: s_not_b64 s[0:1], s[0:1] ; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: scalar_xor_na_b_i64_one_use: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_not_b64 s[0:1], s[0:1] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog entry: %na = xor i64 %a, -1 %r0.val = xor i64 %na, %b diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s define i64 @zextload_global_i1_to_i64(i1 addrspace(1)* %ptr) { ; GFX9-LABEL: zextload_global_i1_to_i64: @@ -33,6 +34,16 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zextload_global_i1_to_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load i1, i1 addrspace(1)* %ptr %ext = zext i1 %load to i64 ret i64 %ext @@ -65,6 +76,15 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zextload_global_i8_to_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load i8, i8 addrspace(1)* %ptr %ext = zext i8 %load to i64 ret i64 %ext @@ -97,6 +117,15 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zextload_global_i16_to_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load i16, i16 addrspace(1)* %ptr %ext = zext i16 %load to i64 ret i64 %ext @@ -129,6 +158,15 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zextload_global_i32_to_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr %ext = zext i32 %load to i64 ret i64 %ext @@ -164,6 +202,16 @@ ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zextload_global_i32_to_i96: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr %ext = zext i32 %load to i96 ret i96 %ext @@ -202,6 +250,17 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zextload_global_i32_to_i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr %ext = zext i32 %load to i128 ret i128 %ext diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll --- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll +++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll @@ -1,40 +1,68 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX9 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX10 define amdgpu_kernel void @test0() { -; CHECK-LABEL: test0: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_endpgm +; GFX9-LABEL: test0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_endpgm tail call void @llvm.amdgcn.endpgm() unreachable } define void @test1() { -; CHECK-LABEL: test1: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_endpgm +; GFX9-LABEL: test1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm tail call void @llvm.amdgcn.endpgm() unreachable } define amdgpu_kernel void @test2(i32* %p, i32 %x) { -; CHECK-LABEL: test2: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_lt_i32 s2, 1 -; CHECK-NEXT: s_cbranch_scc0 BB2_2 -; CHECK-NEXT: ; %bb.1: ; %else -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v2, s2 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: flat_store_dword v[0:1], v2 -; CHECK-NEXT: s_endpgm -; CHECK-NEXT: BB2_2: ; %then -; CHECK-NEXT: s_endpgm +; GFX9-LABEL: test2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lt_i32 s2, 1 +; GFX9-NEXT: s_cbranch_scc0 BB2_2 +; GFX9-NEXT: ; %bb.1: ; %else +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB2_2: ; %then +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_lt_i32 s2, 1 +; GFX10-NEXT: s_cbranch_scc0 BB2_2 +; GFX10-NEXT: ; %bb.1: ; %else +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_endpgm +; GFX10-NEXT: BB2_2: ; %then +; GFX10-NEXT: s_endpgm %cond = icmp sgt i32 %x, 0 br i1 %cond, label %then, label %else diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s define <2 x half> @chain_hi_to_lo_private() { ; GFX900-LABEL: chain_hi_to_lo_private: @@ -22,6 +24,29 @@ ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private: +; GFX10_DEFAULT: ; %bb.0: ; %bb +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private: +; FLATSCR_GFX10: ; %bb.0: ; %bb +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 +; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1 %load_lo = load half, half addrspace(5)* %gep_lo @@ -52,6 +77,26 @@ ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, v1, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_different_bases: +; GFX10_DEFAULT: ; %bb.0: ; %bb +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_different_bases: +; FLATSCR_GFX10: ; %bb.0: ; %bb +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(5)* %base_lo %load_hi = load half, half addrspace(5)* %base_hi @@ -80,6 +125,26 @@ ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10_DEFAULT-LABEL: chain_hi_to_lo_arithmatic: +; GFX10_DEFAULT: ; %bb.0: ; %bb +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10_DEFAULT-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v0, v1 +; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR_GFX10-LABEL: chain_hi_to_lo_arithmatic: +; FLATSCR_GFX10: ; %bb.0: ; %bb +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FLATSCR_GFX10-NEXT: v_add_f16_e32 v1, 1.0, v1 +; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 +; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] bb: %arith_lo = fadd half %in, 1.0 %load_hi = load half, half addrspace(5)* %base @@ -100,6 +165,17 @@ ; GCN-NEXT: ds_read_u16_d16_hi v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_group: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: ds_read_u16 v0, v1 offset:2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1 %load_lo = load half, half addrspace(3)* %gep_lo @@ -121,6 +197,16 @@ ; GCN-NEXT: ds_read_u16_d16_hi v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_group_different_bases: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u16 v0, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(3)* %base_lo %load_hi = load half, half addrspace(3)* %base_hi @@ -144,6 +230,20 @@ ; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_global: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1 %load_lo = load half, half addrspace(1)* %gep_lo @@ -165,6 +265,16 @@ ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_global_different_bases: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(1)* %base_lo %load_hi = load half, half addrspace(1)* %base_hi @@ -188,6 +298,20 @@ ; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2] ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_flat: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: flat_load_ushort v0, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_short_d16_hi v0, v[1:2] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half* null, i64 1 %load_lo = load half, half* %gep_lo @@ -209,6 +333,16 @@ ; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_flat_different_bases: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_load_ushort v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_short_d16_hi v0, v[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half* %base_lo %load_hi = load half, half* %base_hi @@ -283,6 +417,78 @@ ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR-NEXT: s_endpgm +; +; GFX10_DEFAULT-LABEL: vload2_private: +; GFX10_DEFAULT: ; %bb.0: ; %entry +; GFX10_DEFAULT-NEXT: s_add_u32 s6, s6, s9 +; GFX10_DEFAULT-NEXT: s_addc_u32 s7, s7, 0 +; GFX10_DEFAULT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10_DEFAULT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v2, 0 +; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s9 +; GFX10_DEFAULT-NEXT: s_addc_u32 s1, s1, 0 +; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:2 +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6 +; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 +; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10_DEFAULT-NEXT: s_clause 0x1 +; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6 +; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4 +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1) +; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0 +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 +; GFX10_DEFAULT-NEXT: v_lshl_or_b32 v0, v0, 16, v3 +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10_DEFAULT-NEXT: s_endpgm +; +; FLATSCR_GFX10-LABEL: vload2_private: +; FLATSCR_GFX10: ; %bb.0: ; %entry +; FLATSCR_GFX10-NEXT: s_add_u32 s2, s2, s5 +; FLATSCR_GFX10-NEXT: s_addc_u32 s3, s3, 0 +; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:4 +; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 +; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:6 +; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 +; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:8 +; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 +; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, vcc_lo offset:6 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; FLATSCR_GFX10-NEXT: s_endpgm entry: %loc = alloca [3 x i16], align 2, addrspace(5) %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)* @@ -322,6 +528,18 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_group_other_dep: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u16_d16_hi v1, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX10-NEXT: ds_read_u16_d16 v1, v0 offset:2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 %load_lo = load i16, i16 addrspace(3)* %gep_lo @@ -345,6 +563,17 @@ ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GCN-NEXT: v_bfi_b32 v0, v2, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u16 v1, v0 offset:2 +; GFX10-NEXT: ds_read_u16_d16_hi v0, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo @@ -378,6 +607,30 @@ ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep: +; GFX10_DEFAULT: ; %bb.0: ; %bb +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX10_DEFAULT-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 +; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) +; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v0, v1 +; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_other_dep: +; FLATSCR_GFX10: ; %bb.0: ; %bb +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 +; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1 %load_lo = load i16, i16 addrspace(5)* %gep_lo @@ -401,6 +654,18 @@ ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_global_other_dep: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1 %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo @@ -425,6 +690,20 @@ ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_flat_other_dep: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_ushort v2, v[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: flat_load_short_d16_hi v0, v[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1 %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo @@ -448,6 +727,19 @@ ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX10-NEXT: ds_read_u16 v3, v0 +; GFX10-NEXT: ds_write_b16 v1, v2 +; GFX10-NEXT: ds_read_u16 v0, v0 offset:2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir --- a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir +++ b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s # RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX90A %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s --- name: copy_v64_to_v64 @@ -15,6 +16,10 @@ ; GFX90A-LABEL: name: copy_v64_to_v64 ; GFX90A: liveins: $vgpr2_vgpr3 ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10-LABEL: name: copy_v64_to_v64 + ; GFX10: liveins: $vgpr2_vgpr3 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec ... @@ -31,6 +36,10 @@ ; GFX90A-LABEL: name: copy_s64_to_v64 ; GFX90A: liveins: $sgpr2_sgpr3 ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX10-LABEL: name: copy_s64_to_v64 + ; GFX10: liveins: $sgpr2_sgpr3 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec $vgpr0_vgpr1 = COPY killed $sgpr2_sgpr3, implicit $exec ... @@ -48,6 +57,10 @@ ; GFX90A: liveins: $agpr2_agpr3 ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX10-LABEL: name: copy_a64_to_v64 + ; GFX10: liveins: $agpr2_agpr3 + ; GFX10: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 + ; GFX10: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec $vgpr0_vgpr1 = COPY killed $agpr2_agpr3, implicit $exec ... @@ -67,6 +80,12 @@ ; GFX90A: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX10-LABEL: name: copy_v128_to_v128_fwd + ; GFX10: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec ... @@ -86,6 +105,12 @@ ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX10-LABEL: name: copy_v128_to_v128_back + ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX10: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX10: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec $vgpr2_vgpr3_vgpr4_vgpr5 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ... @@ -105,6 +130,11 @@ ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX10-LABEL: name: copy_v96_to_v96 + ; GFX10: liveins: $vgpr4_vgpr5_vgpr6 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr4_vgpr5_vgpr6, implicit $exec ... @@ -121,6 +151,10 @@ ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX90A: liveins: $vgpr3 ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub0 + ; GFX10: liveins: $vgpr3 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec ... @@ -137,6 +171,10 @@ ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX90A: liveins: $vgpr2 ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub1 + ; GFX10: liveins: $vgpr2 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec ... @@ -156,6 +194,12 @@ ; GFX90A: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX10-LABEL: name: copy_s128_to_v128_killed + ; GFX10: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX10: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7 ... @@ -173,6 +217,10 @@ ; GFX90A: liveins: $vgpr2_vgpr3 ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX10-LABEL: name: copy_v64_to_v64_unaligned + ; GFX10: liveins: $vgpr2_vgpr3 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec $vgpr1_vgpr2 = COPY killed $vgpr2_vgpr3, implicit $exec ... @@ -190,6 +238,10 @@ ; GFX90A: liveins: $vgpr3_vgpr4 ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX10-LABEL: name: copy_v64_unaligned_to_v64 + ; GFX10: liveins: $vgpr3_vgpr4 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec $vgpr0_vgpr1 = COPY killed $vgpr3_vgpr4, implicit $exec ... @@ -211,6 +263,12 @@ ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX10-LABEL: name: copy_v128_to_v128_unaligned + ; GFX10: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX10: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec $vgpr1_vgpr2_vgpr3_vgpr4 = COPY killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec ... @@ -232,6 +290,12 @@ ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX10-LABEL: name: copy_v128_unaligned_to_v128 + ; GFX10: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec ... @@ -249,6 +313,10 @@ ; GFX90A: liveins: $sgpr8_sgpr9 ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX10-LABEL: name: copy_s64_to_v64_unaligned + ; GFX10: liveins: $sgpr8_sgpr9 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec $vgpr1_vgpr2 = COPY killed $sgpr8_sgpr9, implicit $exec ... @@ -270,6 +338,12 @@ ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX10-LABEL: name: copy_s128_to_v128_unaligned + ; GFX10: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec $vgpr1_vgpr2_vgpr3_vgpr4 = COPY killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec ... @@ -289,6 +363,11 @@ ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 ; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX10-LABEL: name: copy_v96_to_v96_unaligned + ; GFX10: liveins: $vgpr8_vgpr9_vgpr10 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 + ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec $vgpr1_vgpr2_vgpr3 = COPY killed $vgpr8_vgpr9_vgpr10, implicit $exec ... @@ -308,6 +387,11 @@ ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX10-LABEL: name: copy_v96_unaligned_to_v96 + ; GFX10: liveins: $vgpr7_vgpr8_vgpr9 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr7_vgpr8_vgpr9, implicit $exec ... @@ -327,6 +411,11 @@ ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX10-LABEL: name: copy_s96_to_v96 + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec ... @@ -346,5 +435,10 @@ ; GFX90A: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX10-LABEL: name: copy_s96_to_v96_unaligned + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX10: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 + ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec $vgpr1_vgpr2_vgpr3 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI ; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10 declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone @@ -57,6 +58,20 @@ ; EG-NEXT: CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) +; +; GFX10-LABEL: s_ctlz_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_flbit_i32_b32 s0, s4 +; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s0, s0, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_endpgm %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void @@ -121,6 +136,21 @@ ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) +; +; GFX10-LABEL: v_ctlz_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %in.gep, align 4 @@ -197,6 +227,24 @@ ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) +; +; GFX10-LABEL: v_ctlz_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v3, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 @@ -291,6 +339,30 @@ ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) +; +; GFX10-LABEL: v_ctlz_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v5, v3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_ffbh_u32_e32 v6, v2 +; GFX10-NEXT: v_ffbh_u32_e32 v7, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v8, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 32, v5, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 32, v6, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v7, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v8, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 @@ -367,6 +439,22 @@ ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: v_ctlz_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1 +; GFX10-NEXT: v_add_nc_u16_e64 v1, v1, -8 +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone store i8 %ctlz, i8 addrspace(1)* %out @@ -432,6 +520,25 @@ ; EG-NEXT: MOV T0.Y, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: s_ctlz_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_flbit_i32_b32 s0, s2 +; GFX10-NEXT: s_flbit_i32_b32 s1, s3 +; GFX10-NEXT: s_add_i32 s0, s0, 32 +; GFX10-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10-NEXT: s_cselect_b32 s0, s0, s1 +; GFX10-NEXT: s_or_b32 s1, s2, s3 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_cselect_b32 s0, s0, 64 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) store i64 %ctlz, i64 addrspace(1)* %out ret void @@ -493,6 +600,25 @@ ; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: s_ctlz_i64_trunc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_flbit_i32_b32 s0, s2 +; GFX10-NEXT: s_flbit_i32_b32 s1, s3 +; GFX10-NEXT: s_add_i32 s0, s0, 32 +; GFX10-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10-NEXT: s_cselect_b32 s0, s0, s1 +; GFX10-NEXT: s_or_b32 s1, s2, s3 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_cselect_b32 s0, s0, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) %trunc = trunc i64 %ctlz to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -577,6 +703,26 @@ ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, ; EG-NEXT: LSHR * T1.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: v_ctlz_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v3, v0 +; GFX10-NEXT: v_ffbh_u32_e32 v4, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -664,6 +810,26 @@ ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z, ; EG-NEXT: LSHR * T1.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: v_ctlz_i64_trunc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v4, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 64, v2, vcc_lo +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -731,6 +897,19 @@ ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: -1(nan), 2(2.802597e-45) +; +; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %in.gep @@ -798,6 +977,19 @@ ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: -1(nan), 2(2.802597e-45) +; +; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %in.gep @@ -876,6 +1068,23 @@ ; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, literal.x, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: -1(nan), 2(2.802597e-45) +; +; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %in.gep @@ -953,6 +1162,23 @@ ; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, ; EG-NEXT: -1(nan), 2(2.802597e-45) +; +; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %in.gep @@ -1023,6 +1249,18 @@ ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid %val = load i8, i8 addrspace(1)* %valptr.gep @@ -1096,6 +1334,22 @@ ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v1, vcc_lo +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone %cmp = icmp eq i16 %val, 0 @@ -1167,6 +1421,19 @@ ; EG-NEXT: MOV * T0.Z, 0.0, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid %val = load i7, i7 addrspace(1)* %valptr.gep diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone @@ -11,6 +12,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_i32_to_f32_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = uitofp i32 %masked to float ret float %cvt @@ -22,6 +30,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_sitofp_i32_to_f32_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = sitofp i32 %masked to float ret float %cvt @@ -34,6 +49,14 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f32_lshr7_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 7, v0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.7 = lshr i32 %arg0, 7 %masked = and i32 %lshr.7, 255 %cvt = uitofp i32 %masked to float @@ -46,6 +69,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f32_lshr8_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 %masked = and i32 %lshr.8, 255 %cvt = uitofp i32 %masked to float @@ -74,6 +104,16 @@ ; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 store i32 %lshr.8, i32 addrspace(1)* undef %masked = and i32 %lshr.8, 255 @@ -87,6 +127,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f32_lshr16_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 16 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to float @@ -99,6 +146,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f32_lshr24_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 24 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to float @@ -111,6 +165,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_i8_to_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i8 %arg0 to float ret float %cvt } @@ -123,6 +184,15 @@ ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_v2i8_to_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = bitcast i16 %arg0 to <2 x i8> %cvt = uitofp <2 x i8> %val to <2 x float> ret <2 x float> %cvt @@ -137,6 +207,16 @@ ; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_v3i8_to_v3f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %trunc = trunc i32 %arg0 to i24 %val = bitcast i24 %trunc to <3 x i8> %cvt = uitofp <3 x i8> %val to <3 x float> @@ -153,6 +233,17 @@ ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_v4i8_to_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = bitcast i32 %arg0 to <4 x i8> %cvt = uitofp <4 x i8> %val to <4 x float> ret <4 x float> %cvt @@ -168,6 +259,17 @@ ; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_unpack_i32_to_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %mask.arg0 = and i32 %arg0, 255 %cvt0 = uitofp i32 %mask.arg0 to float @@ -205,6 +307,14 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_i32_to_f16_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = uitofp i32 %masked to half ret half %cvt @@ -225,6 +335,14 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_sitofp_i32_to_f16_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = sitofp i32 %masked to half ret half %cvt @@ -245,6 +363,14 @@ ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f16_lshr8_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 %masked = and i32 %lshr.8, 255 %cvt = uitofp i32 %masked to half @@ -266,6 +392,14 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f16_lshr16_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 16 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to half @@ -287,6 +421,14 @@ ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f16_lshr24_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 24 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to half @@ -307,6 +449,13 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_i8_to_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i8 %arg0 to half ret half %cvt } @@ -318,6 +467,14 @@ ; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_i32_to_f64_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = uitofp i32 %masked to double ret double %cvt @@ -330,6 +487,14 @@ ; GCN-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f64_lshr8_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_bfe_u32 v0, v0, 8, 8 +; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 %masked = and i32 %lshr.8, 255 %cvt = uitofp i32 %masked to double @@ -343,6 +508,14 @@ ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f64_lshr16_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 16 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to double @@ -356,6 +529,14 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_to_f64_lshr24_mask255: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 24 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to double @@ -377,6 +558,15 @@ ; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_uitofp_i8_to_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX10-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i8 %arg0 to double ret double %cvt } @@ -413,6 +603,18 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: load_i8_to_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid %load = load i8, i8 addrspace(1)* %gep, align 1 @@ -457,6 +659,20 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: load_v2i8_to_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2 @@ -504,6 +720,21 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: load_v3i8_to_v3f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4 @@ -552,6 +783,22 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: load_v4i8_to_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 @@ -623,6 +870,29 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 @@ -711,6 +981,40 @@ ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX10-NEXT: s_movk_i32 s0, 0x900 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 +; GFX10-NEXT: v_add_nc_u16_e64 v4, v0, 9 +; GFX10-NEXT: v_add_nc_u16_e64 v2, v2, 9 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX10-NEXT: v_add_nc_u16_e64 v1, v1, s0 +; GFX10-NEXT: v_add_nc_u16_e64 v5, v2, s0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: global_store_dword v4, v5, s[4:5] +; GFX10-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 @@ -810,6 +1114,40 @@ ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: load_v7i8_to_v7f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x5 +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 +; GFX10-NEXT: global_store_dword v11, v6, s[0:1] offset:24 +; GFX10-NEXT: global_store_dwordx2 v11, v[4:5], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1 @@ -868,6 +1206,27 @@ ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: load_v8i8_to_v8f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v9 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v8 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8 @@ -912,6 +1271,20 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: i8_zext_inreg_i32_to_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep, align 4 @@ -956,6 +1329,19 @@ ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep, align 4 @@ -1000,6 +1386,18 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: i8_zext_i32_to_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid %load = load i8, i8 addrspace(1)* %gep, align 1 @@ -1068,6 +1466,30 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 8, v2 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 @@ -1111,6 +1533,19 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: extract_byte0_to_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -1154,6 +1589,19 @@ ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: extract_byte1_to_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -1198,6 +1646,19 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: extract_byte2_to_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -1242,6 +1703,19 @@ ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: extract_byte3_to_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -1292,6 +1766,20 @@ ; VI-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: cvt_ubyte0_or_multiuse: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -23,6 +24,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9-NEXT: ds_write_b32 v0, v1 offset:12 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: write_ds_sub0_offset0_global: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX10-NEXT: ds_write_b32 v0, v1 offset:12 +; GFX10-NEXT: s_endpgm entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %sub1 = sub i32 0, %x.i @@ -69,6 +78,22 @@ ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 0, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: ds_write_b32 v3, v2 offset:12 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_div_fmas_f32 v7, s0, s0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %sub1 = sub i32 0, %x.i @@ -97,6 +122,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 13 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:65535 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535 +; GFX10-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -123,6 +156,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 13 ; GFX9-NEXT: ds_write_b8 v0, v1 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0 +; GFX10-NEXT: ds_write_b8 v0, v1 +; GFX10-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -151,6 +192,15 @@ ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:456 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX10-NEXT: ds_write_b32 v0, v1 offset:123 +; GFX10-NEXT: ds_write_b32 v0, v1 offset:456 +; GFX10-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -182,6 +232,15 @@ ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX10-NEXT: ds_write_b32 v0, v1 offset:123 +; GFX10-NEXT: ds_write_b32 v0, v1 offset:123 +; GFX10-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -211,6 +270,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x3fb, v0 +; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 +; GFX10-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -259,6 +327,23 @@ ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0x3fb, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: ds_write2_b32 v4, v2, v3 offset1:1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_div_fmas_f32 v7, s0, s0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -289,6 +374,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x3fc, v0 +; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 +; GFX10-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir --- a/llvm/test/CodeGen/AMDGPU/early-term.mir +++ b/llvm/test/CodeGen/AMDGPU/early-term.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s --- | define amdgpu_ps void @early_term_scc0_end_block() { @@ -30,21 +30,21 @@ - { reg: '$sgpr0' } - { reg: '$sgpr1' } body: | - ; CHECK-LABEL: name: early_term_scc0_end_block - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x80000000), %bb.2(0x00000000) - ; CHECK: liveins: $sgpr0, $sgpr1 - ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc - ; CHECK: S_CBRANCH_SCC0 %bb.2, implicit $scc - ; CHECK: bb.1: - ; CHECK: liveins: $vgpr0 - ; CHECK: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec - ; CHECK: S_ENDPGM 0 - ; CHECK: bb.2: - ; CHECK: $exec_lo = S_MOV_B32 0 - ; CHECK: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec - ; CHECK: S_ENDPGM 0 + ; GFX10-LABEL: name: early_term_scc0_end_block + ; GFX10: bb.0: + ; GFX10: successors: %bb.1(0x80000000), %bb.2(0x00000000) + ; GFX10: liveins: $sgpr0, $sgpr1 + ; GFX10: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc + ; GFX10: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GFX10: bb.1: + ; GFX10: liveins: $vgpr0 + ; GFX10: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX10: S_ENDPGM 0 + ; GFX10: bb.2: + ; GFX10: $exec_lo = S_MOV_B32 0 + ; GFX10: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0 bb.0: liveins: $sgpr0, $sgpr1 successors: %bb.1 @@ -66,25 +66,25 @@ - { reg: '$sgpr0' } - { reg: '$sgpr1' } body: | - ; CHECK-LABEL: name: early_term_scc0_next_terminator - ; CHECK: bb.0: - ; CHECK: successors: %bb.2(0x80000000), %bb.3(0x00000000) - ; CHECK: liveins: $sgpr0, $sgpr1 - ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc - ; CHECK: S_CBRANCH_SCC0 %bb.3, implicit $scc - ; CHECK: S_BRANCH %bb.2 - ; CHECK: bb.1: - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; CHECK: bb.2: - ; CHECK: liveins: $vgpr0 - ; CHECK: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec - ; CHECK: S_ENDPGM 0 - ; CHECK: bb.3: - ; CHECK: $exec_lo = S_MOV_B32 0 - ; CHECK: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec - ; CHECK: S_ENDPGM 0 + ; GFX10-LABEL: name: early_term_scc0_next_terminator + ; GFX10: bb.0: + ; GFX10: successors: %bb.2(0x80000000), %bb.3(0x00000000) + ; GFX10: liveins: $sgpr0, $sgpr1 + ; GFX10: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc + ; GFX10: S_CBRANCH_SCC0 %bb.3, implicit $scc + ; GFX10: S_BRANCH %bb.2 + ; GFX10: bb.1: + ; GFX10: successors: %bb.2(0x80000000) + ; GFX10: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GFX10: bb.2: + ; GFX10: liveins: $vgpr0 + ; GFX10: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX10: S_ENDPGM 0 + ; GFX10: bb.3: + ; GFX10: $exec_lo = S_MOV_B32 0 + ; GFX10: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0 bb.0: liveins: $sgpr0, $sgpr1 successors: %bb.2 @@ -112,26 +112,26 @@ - { reg: '$sgpr0' } - { reg: '$sgpr1' } body: | - ; CHECK-LABEL: name: early_term_scc0_in_block - ; CHECK: bb.0: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: liveins: $sgpr0, $sgpr1 - ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc - ; CHECK: S_CBRANCH_SCC0 %bb.2, implicit $scc - ; CHECK: bb.3: - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $vgpr0, $scc - ; CHECK: $vgpr1 = V_MOV_B32_e32 1, implicit $exec - ; CHECK: bb.1: - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK: EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec - ; CHECK: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec - ; CHECK: S_ENDPGM 0 - ; CHECK: bb.2: - ; CHECK: $exec_lo = S_MOV_B32 0 - ; CHECK: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec - ; CHECK: S_ENDPGM 0 + ; GFX10-LABEL: name: early_term_scc0_in_block + ; GFX10: bb.0: + ; GFX10: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX10: liveins: $sgpr0, $sgpr1 + ; GFX10: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc + ; GFX10: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GFX10: bb.3: + ; GFX10: successors: %bb.1(0x80000000) + ; GFX10: liveins: $vgpr0, $scc + ; GFX10: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GFX10: bb.1: + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10: EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX10: S_ENDPGM 0 + ; GFX10: bb.2: + ; GFX10: $exec_lo = S_MOV_B32 0 + ; GFX10: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0 bb.0: liveins: $sgpr0, $sgpr1 successors: %bb.1 @@ -155,15 +155,15 @@ - { reg: '$sgpr0' } - { reg: '$sgpr1' } body: | - ; CHECK-LABEL: name: early_term_scc0_gs - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $sgpr0, $sgpr1 - ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc - ; CHECK: bb.1: - ; CHECK: liveins: $vgpr0 - ; CHECK: S_ENDPGM 0 + ; GFX10-LABEL: name: early_term_scc0_gs + ; GFX10: bb.0: + ; GFX10: successors: %bb.1(0x80000000) + ; GFX10: liveins: $sgpr0, $sgpr1 + ; GFX10: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc + ; GFX10: bb.1: + ; GFX10: liveins: $vgpr0 + ; GFX10: S_ENDPGM 0 bb.0: liveins: $sgpr0, $sgpr1 successors: %bb.1 @@ -184,19 +184,19 @@ - { reg: '$sgpr0' } - { reg: '$sgpr1' } body: | - ; CHECK-LABEL: name: early_term_scc0_cs - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x80000000), %bb.2(0x00000000) - ; CHECK: liveins: $sgpr0, $sgpr1 - ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc - ; CHECK: S_CBRANCH_SCC0 %bb.2, implicit $scc - ; CHECK: bb.1: - ; CHECK: liveins: $vgpr0 - ; CHECK: S_ENDPGM 0 - ; CHECK: bb.2: - ; CHECK: $exec_lo = S_MOV_B32 0 - ; CHECK: S_ENDPGM 0 + ; GFX10-LABEL: name: early_term_scc0_cs + ; GFX10: bb.0: + ; GFX10: successors: %bb.1(0x80000000), %bb.2(0x00000000) + ; GFX10: liveins: $sgpr0, $sgpr1 + ; GFX10: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc + ; GFX10: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GFX10: bb.1: + ; GFX10: liveins: $vgpr0 + ; GFX10: S_ENDPGM 0 + ; GFX10: bb.2: + ; GFX10: $exec_lo = S_MOV_B32 0 + ; GFX10: S_ENDPGM 0 bb.0: liveins: $sgpr0, $sgpr1 successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck --check-prefix=GFX7-ALIGNED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX7-UNALIGNED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX10 %s ; Should not merge this to a dword load define i32 @global_load_2xi16_align2(i16 addrspace(1)* %p) #0 { @@ -37,6 +38,17 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_load_2xi16_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-NEXT: global_load_ushort v3, v[0:1], off offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 2 %p.1 = load i16, i16 addrspace(1)* %gep.p, align 2 @@ -91,6 +103,17 @@ ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_store_2xi16_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2 +; GFX10-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 store i16 1, i16 addrspace(1)* %r, align 2 store i16 2, i16 addrspace(1)* %gep.r, align 2 @@ -140,6 +163,16 @@ ; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_load_2xi16_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 1 %p.1 = load i16, i16 addrspace(1)* %gep.p, align 1 @@ -197,6 +230,15 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_store_2xi16_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 store i16 1, i16 addrspace(1)* %r, align 1 store i16 2, i16 addrspace(1)* %gep.r, align 1 @@ -236,6 +278,16 @@ ; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_load_2xi16_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 4 %p.1 = load i16, i16 addrspace(1)* %gep.p, align 2 @@ -286,6 +338,15 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_store_2xi16_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 store i16 1, i16 addrspace(1)* %r, align 4 store i16 2, i16 addrspace(1)* %gep.r, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX7-UNALIGNED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access -amdgpu-enable-flat-scratch < %s | FileCheck --check-prefix=GFX9-FLASTSCR %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-scratch-access -amdgpu-enable-flat-scratch < %s | FileCheck --check-prefix=GFX10-FLASTSCR %s ; Should not merge this to a dword load define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 { @@ -45,6 +47,28 @@ ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: private_load_2xi16_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLASTSCR-LABEL: private_load_2xi16_align2: +; GFX10-FLASTSCR: ; %bb.0: +; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLASTSCR-NEXT: s_clause 0x1 +; GFX10-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off +; GFX10-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off offset:2 +; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 2 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2 @@ -98,6 +122,28 @@ ; GFX9-FLASTSCR-NEXT: scratch_store_short v1, v0, off offset:2 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: private_store_2xi16_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_short v2, v1, s[0:3], 0 offen offset:2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLASTSCR-LABEL: private_store_2xi16_align2: +; GFX10-FLASTSCR: ; %bb.0: +; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-FLASTSCR-NEXT: scratch_store_short v1, v0, off +; GFX10-FLASTSCR-NEXT: scratch_store_short v1, v2, off offset:2 +; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 2 store i16 2, i16 addrspace(5)* %gep.r, align 2 @@ -155,6 +201,26 @@ ; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0 ; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: private_load_2xi16_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLASTSCR-LABEL: private_load_2xi16_align1: +; GFX10-FLASTSCR: ; %bb.0: +; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off +; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 1 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 1 @@ -206,6 +272,24 @@ ; GFX9-FLASTSCR-NEXT: scratch_store_dword v1, v0, off ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: private_store_2xi16_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLASTSCR-LABEL: private_store_2xi16_align1: +; GFX10-FLASTSCR: ; %bb.0: +; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-FLASTSCR-NEXT: scratch_store_dword v1, v0, off +; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 1 store i16 2, i16 addrspace(5)* %gep.r, align 1 @@ -256,6 +340,26 @@ ; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0 ; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: private_load_2xi16_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLASTSCR-LABEL: private_load_2xi16_align4: +; GFX10-FLASTSCR: ; %bb.0: +; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off +; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 +; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 4 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2 @@ -309,6 +413,24 @@ ; GFX9-FLASTSCR-NEXT: scratch_store_dword v1, v0, off ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: private_store_2xi16_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLASTSCR-LABEL: private_store_2xi16_align4: +; GFX10-FLASTSCR: ; %bb.0: +; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-FLASTSCR-NEXT: scratch_store_dword v1, v0, off +; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 4 store i16 2, i16 addrspace(5)* %gep.r, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll --- a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll +++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll @@ -4,12 +4,14 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefix=NOFUSE %s ; RUN: llc -march=amdgcn -mcpu=tonga -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefix=NOFUSE %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefix=FMA %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefix=FMAGFX10 %s ; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMAD %s ; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMAD %s ; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMAD %s ; RUN: llc -march=amdgcn -mcpu=tonga -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMAD %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMAD %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMADGFX10 %s ; Check for incorrect fmad formation when distributing @@ -27,11 +29,25 @@ ; NOFUSE-NEXT: v_mul_f32_e32 v0, v0, v1 ; NOFUSE-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX10-LABEL: unsafe_fmul_fadd_distribute_fast_f32: +; FMAGFX10: ; %bb.0: +; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX10-NEXT: v_fmac_f32_e32 v0, v1, v0 +; FMAGFX10-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_f32: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMAD-NEXT: v_mac_f32_e32 v0, v1, v0 ; FMAD-NEXT: s_setpc_b64 s[30:31] +; +; FMADGFX10-LABEL: unsafe_fmul_fadd_distribute_fast_f32: +; FMADGFX10: ; %bb.0: +; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMADGFX10-NEXT: v_fmac_f32_e32 v0, v1, v0 +; FMADGFX10-NEXT: s_setpc_b64 s[30:31] %add = fadd fast float %arg1, 1.0 %tmp1 = fmul fast float %arg0, %add ret float %tmp1 @@ -51,11 +67,25 @@ ; NOFUSE-NEXT: v_mul_f32_e32 v0, v0, v1 ; NOFUSE-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX10-LABEL: unsafe_fmul_fsub_distribute_fast_f32: +; FMAGFX10: ; %bb.0: +; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX10-NEXT: v_fmac_f32_e64 v0, -v1, v0 +; FMAGFX10-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_f32: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMAD-NEXT: v_mad_f32 v0, -v1, v0, v0 ; FMAD-NEXT: s_setpc_b64 s[30:31] +; +; FMADGFX10-LABEL: unsafe_fmul_fsub_distribute_fast_f32: +; FMADGFX10: ; %bb.0: +; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMADGFX10-NEXT: v_fmac_f32_e64 v0, -v1, v0 +; FMADGFX10-NEXT: s_setpc_b64 s[30:31] %add = fsub fast float 1.0, %arg1 %tmp1 = fmul fast float %arg0, %add ret float %tmp1 @@ -78,12 +108,28 @@ ; NOFUSE-NEXT: v_mul_f32_e32 v1, v1, v3 ; NOFUSE-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX10-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32: +; FMAGFX10: ; %bb.0: +; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX10-NEXT: v_fmac_f32_e32 v0, v2, v0 +; FMAGFX10-NEXT: v_fmac_f32_e32 v1, v3, v1 +; FMAGFX10-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMAD-NEXT: v_mac_f32_e32 v0, v2, v0 ; FMAD-NEXT: v_mac_f32_e32 v1, v3, v1 ; FMAD-NEXT: s_setpc_b64 s[30:31] +; +; FMADGFX10-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32: +; FMADGFX10: ; %bb.0: +; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMADGFX10-NEXT: v_fmac_f32_e32 v0, v2, v0 +; FMADGFX10-NEXT: v_fmac_f32_e32 v1, v3, v1 +; FMADGFX10-NEXT: s_setpc_b64 s[30:31] %add = fadd fast <2 x float> %arg1, %tmp1 = fmul fast <2 x float> %arg0, %add ret <2 x float> %tmp1 @@ -106,12 +152,28 @@ ; NOFUSE-NEXT: v_mul_f32_e32 v1, v1, v3 ; NOFUSE-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX10-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32: +; FMAGFX10: ; %bb.0: +; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX10-NEXT: v_fmac_f32_e64 v0, -v2, v0 +; FMAGFX10-NEXT: v_fmac_f32_e64 v1, -v3, v1 +; FMAGFX10-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMAD-NEXT: v_mad_f32 v0, -v2, v0, v0 ; FMAD-NEXT: v_mad_f32 v1, -v3, v1, v1 ; FMAD-NEXT: s_setpc_b64 s[30:31] +; +; FMADGFX10-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32: +; FMADGFX10: ; %bb.0: +; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMADGFX10-NEXT: v_fmac_f32_e64 v0, -v2, v0 +; FMADGFX10-NEXT: v_fmac_f32_e64 v1, -v3, v1 +; FMADGFX10-NEXT: s_setpc_b64 s[30:31] %add = fsub fast <2 x float> , %arg1 %tmp1 = fmul fast <2 x float> %arg0, %add ret <2 x float> %tmp1 @@ -131,11 +193,25 @@ ; NOFUSE-NEXT: v_mul_f32_e32 v0, v1, v0 ; NOFUSE-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX10-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32: +; FMAGFX10: ; %bb.0: +; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX10-NEXT: v_fma_f32 v0, v0, v1, v1 +; FMAGFX10-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMAD-NEXT: v_mad_f32 v0, v0, v1, v1 ; FMAD-NEXT: s_setpc_b64 s[30:31] +; +; FMADGFX10-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32: +; FMADGFX10: ; %bb.0: +; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMADGFX10-NEXT: v_mad_f32 v0, v0, v1, v1 +; FMADGFX10-NEXT: s_setpc_b64 s[30:31] %add = fadd fast float %arg0, 1.0 %splat = insertelement <2 x float> undef, float %add, i32 0 %tmp1 = fmul fast <2 x float> %arg1, %splat @@ -156,11 +232,25 @@ ; NOFUSE-NEXT: v_mul_f32_e32 v0, v1, v0 ; NOFUSE-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX10-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize: +; FMAGFX10: ; %bb.0: +; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX10-NEXT: v_fma_f32 v0, v1, -v0, v1 +; FMAGFX10-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FMAD-NEXT: v_mad_f32 v0, -v0, v1, v1 ; FMAD-NEXT: s_setpc_b64 s[30:31] +; +; FMADGFX10-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize: +; FMADGFX10: ; %bb.0: +; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; FMADGFX10-NEXT: v_mad_f32 v0, -v0, v1, v1 +; FMADGFX10-NEXT: s_setpc_b64 s[30:31] %sub = fsub fast float 1.0, %arg0 %splat = insertelement <2 x float> undef, float %sub, i32 0 %tmp1 = fmul fast <2 x float> %arg1, %splat diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll @@ -1,18 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-no-signed-zeros-fp-math=true < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-no-signed-zeros-fp-math=false < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-no-signed-zeros-fp-math=true < %s | FileCheck %s --check-prefix=GFX9 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-no-signed-zeros-fp-math=false < %s | FileCheck %s --check-prefix=GFX9 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-no-signed-zeros-fp-math=true < %s | FileCheck %s --check-prefix=GFX10 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-no-signed-zeros-fp-math=false < %s | FileCheck %s --check-prefix=GFX10 ; no-signed-zeros-fp-math should not increase the number of ; instructions emitted. define { double, double } @testfn(double %arg, double %arg1, double %arg2) { -; CHECK-LABEL: testfn: -; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_add_f64 v[4:5], v[4:5], -v[0:1] -; CHECK-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] -; CHECK-NEXT: v_add_f64 v[2:3], -v[2:3], -v[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: testfn: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], -v[0:1] +; GFX9-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX9-NEXT: v_add_f64 v[2:3], -v[2:3], -v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: testfn: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1] +; GFX10-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX10-NEXT: v_add_f64 v[2:3], -v[2:3], -v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %tmp = fsub fast double 0.000000e+00, %arg1 %tmp3 = fsub fast double %arg2, %arg diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/fpow.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define float @v_pow_f32(float %x, float %y) { ; GFX6-LABEL: v_pow_f32: @@ -27,6 +28,15 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -64,6 +74,18 @@ ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f32_e32 v1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_log_f32_e32 v1, v1 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v2, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_exp_f32_e32 v1, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y) ret <2 x float> %pow } @@ -102,6 +124,18 @@ ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %pow = call half @llvm.pow.f16(half %x, half %y) ret half %pow } @@ -162,6 +196,26 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_log_f32_e32 v2, v2 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v1, v2 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) ret <2 x half> %pow } @@ -226,6 +280,26 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_v2f16_fneg_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_f16_e64 v2, -v0 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_log_f32_e32 v2, v2 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v1, v2 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) ret <2 x half> %pow @@ -291,6 +365,26 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_v2f16_fneg_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_log_f32_e32 v2, v2 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v1, v2 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) ret <2 x half> %pow @@ -361,6 +455,26 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_f16_e64 v2, -v0 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e64 v3, -v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_log_f32_e32 v2, v2 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v1, v2 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg) @@ -400,6 +514,16 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f32_fabs_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %pow = call float @llvm.pow.f32(float %fabs.x, float %y) ret float %pow @@ -432,6 +556,16 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f32_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %x, float %fabs.y) ret float %pow @@ -470,6 +604,18 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_pow_f32_fabs_lhs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_brev_b32 s4, -2 +; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %fabs.x, float %fabs.y) @@ -497,6 +643,13 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_pow_f32_sgpr_vgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_log_f32_e32 v1, s0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -522,6 +675,13 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_pow_f32_vgpr_sgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -547,6 +707,13 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_pow_f32_sgpr_sgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_log_f32_e32 v0, s0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -2,6 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, ; SI-LABEL: frem_f16: @@ -113,6 +115,50 @@ ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX9-NEXT: v_trunc_f16_e32 v3, v3 +; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX10-NEXT: v_trunc_f16_e32 v3, v3 +; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2 +; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm half addrspace(1)* %in2) #0 { %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 %r0 = load half, half addrspace(1)* %in1, align 4 @@ -203,6 +249,40 @@ ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fast_frem_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_rcp_f16_e32 v3, v2 +; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX9-NEXT: v_trunc_f16_e32 v3, v3 +; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: fast_frem_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_rcp_f16_e32 v3, v2 +; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX10-NEXT: v_trunc_f16_e32 v3, v3 +; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2 +; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm half addrspace(1)* %in2) #0 { %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 %r0 = load half, half addrspace(1)* %in1, align 4 @@ -293,6 +373,40 @@ ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: unsafe_frem_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_rcp_f16_e32 v3, v2 +; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX9-NEXT: v_trunc_f16_e32 v3, v3 +; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: unsafe_frem_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_rcp_f16_e32 v3, v2 +; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX10-NEXT: v_trunc_f16_e32 v3, v3 +; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2 +; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm half addrspace(1)* %in2) #1 { %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 %r0 = load half, half addrspace(1)* %in1, align 4 @@ -408,6 +522,62 @@ ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1 +; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 +; GFX9-NEXT: v_rcp_f32_e32 v5, v4 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v1 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; GFX10-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX10-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX10-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1 +; GFX10-NEXT: v_trunc_f32_e32 v3, v3 +; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm float addrspace(1)* %in2) #0 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -490,6 +660,40 @@ ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fast_frem_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_rcp_f32_e32 v3, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: fast_frem_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX10-NEXT: v_trunc_f32_e32 v3, v3 +; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm float addrspace(1)* %in2) #0 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -572,6 +776,40 @@ ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: unsafe_frem_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_rcp_f32_e32 v3, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: unsafe_frem_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX10-NEXT: v_trunc_f32_e32 v3, v3 +; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm float addrspace(1)* %in2) #1 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -700,6 +938,59 @@ ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] +; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v15, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v15, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v15, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] +; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v15, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm double addrspace(1)* %in2) #0 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -812,6 +1103,52 @@ ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fast_frem_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: fast_frem_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[14:15] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[14:15], v[6:7] +; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm double addrspace(1)* %in2) #0 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -924,6 +1261,52 @@ ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: unsafe_frem_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: unsafe_frem_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[14:15] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[14:15], v[6:7] +; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm double addrspace(1)* %in2) #1 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -1102,6 +1485,75 @@ ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX9-NEXT: v_trunc_f16_e32 v3, v3 +; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1 +; GFX9-NEXT: v_trunc_f16_e32 v4, v4 +; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_trunc_f16_e32 v3, v3 +; GFX10-NEXT: v_fmac_f16_e64 v4, -v3, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX10-NEXT: v_rcp_f32_e32 v5, v5 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX10-NEXT: v_trunc_f16_e32 v3, v3 +; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm <2 x half> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4 %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8 @@ -1381,6 +1833,118 @@ ; VI-NEXT: v_or_b32_e32 v2, v2, v5 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX9-NEXT: v_rcp_f32_e32 v6, v6 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX9-NEXT: v_trunc_f16_e32 v5, v5 +; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX9-NEXT: v_rcp_f32_e32 v7, v7 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1 +; GFX9-NEXT: v_trunc_f16_e32 v6, v6 +; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_and_b32_e32 v5, v3, v5 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX9-NEXT: v_rcp_f32_e32 v6, v6 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0 +; GFX9-NEXT: v_trunc_f16_e32 v5, v5 +; GFX9-NEXT: v_fma_f16 v5, -v5, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX9-NEXT: v_rcp_f32_e32 v7, v7 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX9-NEXT: v_div_fixup_f16 v6, v6, v2, v0 +; GFX9-NEXT: v_trunc_f16_e32 v6, v6 +; GFX9-NEXT: v_fma_f16 v0, -v6, v2, v0 +; GFX9-NEXT: v_and_b32_e32 v2, v3, v5 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v4f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v11, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[15:16], v11, s[2:3] offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; GFX10-NEXT: v_rcp_f32_e32 v6, v6 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX10-NEXT: v_rcp_f32_e32 v7, v7 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX10-NEXT: v_div_fixup_f16 v5, v6, v16, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_trunc_f16_e32 v5, v5 +; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v16 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v10, v5 +; GFX10-NEXT: v_rcp_f32_e32 v7, v7 +; GFX10-NEXT: v_div_fixup_f16 v5, v10, v3, v4 +; GFX10-NEXT: v_trunc_f16_e32 v10, v5 +; GFX10-NEXT: v_fmac_f16_e64 v4, -v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: v_and_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v15 +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX10-NEXT: v_rcp_f32_e32 v6, v6 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX10-NEXT: v_div_fixup_f16 v5, v5, v15, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_trunc_f16_e32 v5, v5 +; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v15 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 +; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 +; GFX10-NEXT: v_trunc_f16_e32 v5, v5 +; GFX10-NEXT: v_fmac_f16_e64 v0, -v5, v2 +; GFX10-NEXT: v_and_b32_e32 v2, v3, v6 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: global_store_dwordx2 v11, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm <4 x half> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4 %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16 @@ -1547,6 +2111,94 @@ ; VI-NEXT: v_fma_f32 v2, -v5, v4, v2 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 +; GFX9-NEXT: s_mov_b32 s2, 3 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 +; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 +; GFX9-NEXT: v_rcp_f32_e32 v7, v6 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7 +; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7 +; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5 +; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8 +; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1 +; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0 +; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v6, v5 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6 +; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7 +; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v11, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v11, s[2:3] offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 +; GFX10-NEXT: v_rcp_f32_e32 v7, v6 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX10-NEXT: v_fma_f32 v7, v8, v7, v7 +; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 +; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5 +; GFX10-NEXT: v_fma_f32 v8, v9, v7, v8 +; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v6, v5, v7, v8 +; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v2, v0 +; GFX10-NEXT: v_div_fixup_f32 v5, v6, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v6, v5 +; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0 +; GFX10-NEXT: v_fma_f32 v1, v3, -v6, v1 +; GFX10-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX10-NEXT: v_fma_f32 v6, v7, v6, v6 +; GFX10-NEXT: v_mul_f32_e32 v7, v4, v6 +; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v4 +; GFX10-NEXT: v_fma_f32 v7, v8, v6, v7 +; GFX10-NEXT: v_fma_f32 v5, -v5, v7, v4 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v3, v5, v6, v7 +; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 +; GFX10-NEXT: v_trunc_f32_e32 v3, v3 +; GFX10-NEXT: v_fmac_f32_e64 v0, -v3, v2 +; GFX10-NEXT: global_store_dwordx2 v11, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm <2 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 @@ -1803,6 +2455,154 @@ ; VI-NEXT: v_fma_f32 v0, -v5, v4, v0 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 +; GFX9-NEXT: s_mov_b32 s2, 3 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 +; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 +; GFX9-NEXT: v_rcp_f32_e32 v11, v10 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11 +; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11 +; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9 +; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12 +; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3 +; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2 +; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 +; GFX9-NEXT: v_rcp_f32_e32 v10, v9 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 +; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 +; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 +; GFX9-NEXT: v_trunc_f32_e32 v7, v7 +; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2 +; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 +; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 +; GFX9-NEXT: v_rcp_f32_e32 v9, v7 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 +; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9 +; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6 +; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10 +; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1 +; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 +; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v7, v6 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 +; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7 +; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5 +; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9 +; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5 +; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9 +; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_fma_f32 v0, -v5, v4, v0 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[15:18], v8, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v18 +; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v18, v7, v18 +; GFX10-NEXT: v_rcp_f32_e32 v11, v10 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX10-NEXT: v_fma_f32 v11, v12, v11, v11 +; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11 +; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9 +; GFX10-NEXT: v_fma_f32 v12, v13, v11, v12 +; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v17, v6, v17 +; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v18 +; GFX10-NEXT: v_trunc_f32_e32 v9, v9 +; GFX10-NEXT: v_fma_f32 v18, v7, -v9, v18 +; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v17 +; GFX10-NEXT: v_rcp_f32_e32 v10, v9 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX10-NEXT: v_fma_f32 v10, v11, v10, v10 +; GFX10-NEXT: v_mul_f32_e32 v11, v0, v10 +; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v0 +; GFX10-NEXT: v_fma_f32 v11, v12, v10, v11 +; GFX10-NEXT: v_fma_f32 v1, -v9, v11, v0 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v7, v1, v10, v11 +; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v17 +; GFX10-NEXT: v_trunc_f32_e32 v7, v7 +; GFX10-NEXT: v_fma_f32 v17, v6, -v7, v17 +; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v16 +; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v16, v5, v16 +; GFX10-NEXT: v_rcp_f32_e32 v9, v7 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 +; GFX10-NEXT: v_fma_f32 v9, v10, v9, v9 +; GFX10-NEXT: v_mul_f32_e32 v0, v6, v9 +; GFX10-NEXT: v_fma_f32 v11, -v7, v0, v6 +; GFX10-NEXT: v_fma_f32 v0, v11, v9, v0 +; GFX10-NEXT: v_fma_f32 v6, -v7, v0, v6 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v0 +; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v16 +; GFX10-NEXT: v_trunc_f32_e32 v6, v6 +; GFX10-NEXT: v_fma_f32 v16, v5, -v6, v16 +; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v15 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v15, v4, v15 +; GFX10-NEXT: v_rcp_f32_e32 v7, v6 +; GFX10-NEXT: s_denorm_mode 15 +; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 +; GFX10-NEXT: v_fma_f32 v7, v9, v7, v7 +; GFX10-NEXT: v_mul_f32_e32 v0, v5, v7 +; GFX10-NEXT: v_fma_f32 v10, -v6, v0, v5 +; GFX10-NEXT: v_fma_f32 v0, v10, v7, v0 +; GFX10-NEXT: v_fma_f32 v5, -v6, v0, v5 +; GFX10-NEXT: s_denorm_mode 12 +; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v0 +; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v15 +; GFX10-NEXT: v_trunc_f32_e32 v5, v5 +; GFX10-NEXT: v_fmac_f32_e64 v15, -v5, v4 +; GFX10-NEXT: global_store_dwordx4 v8, v[15:18], s[4:5] +; GFX10-NEXT: s_endpgm <4 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 @@ -1992,6 +2792,86 @@ ; VI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] +; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] +; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] +; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] +; GFX9-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] +; GFX9-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] +; GFX9-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] +; GFX9-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] +; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v2f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v16, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[18:21], v16, s[2:3] offset:64 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[20:21], v[20:21], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[18:19], v[18:19], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[20:21], v[2:3] +; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[20:21], v[2:3] +; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[20:21], v[2:3] +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[18:19], v[0:1] +; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] +; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[18:19], v[0:1] +; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[18:19], v[0:1] +; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] +; GFX10-NEXT: s_endpgm <2 x double> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10 declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone @@ -72,6 +73,21 @@ ; R600-NEXT: BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshl_i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s6, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v0, s2, s3, 1 +; GFX10-NEXT: s_lshr_b32 s0, s2, 1 +; GFX10-NEXT: s_not_b32 s1, s6 +; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) store i32 %0, i32 addrspace(1)* %in @@ -125,6 +141,17 @@ ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x, ; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00) +; +; GFX10-LABEL: fshl_i32_imm: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7) store i32 %0, i32 addrspace(1)* %in @@ -220,6 +247,26 @@ ; R600-NEXT: BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshl_v2i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s0, s3, 1 +; GFX10-NEXT: v_alignbit_b32 v0, s3, s5, 1 +; GFX10-NEXT: v_alignbit_b32 v3, s2, s4, 1 +; GFX10-NEXT: s_not_b32 s1, s7 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_not_b32 s3, s6 +; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s2, v3, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX10-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) store <2 x i32> %0, <2 x i32> addrspace(1)* %in @@ -284,6 +331,19 @@ ; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00) ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshl_v2i32_imm: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v1, s3, s5, 23 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s4, 25 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) store <2 x i32> %0, <2 x i32> addrspace(1)* %in @@ -423,6 +483,34 @@ ; R600-NEXT: BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshl_v4i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s0, s7, 1 +; GFX10-NEXT: v_alignbit_b32 v5, s5, s9, 1 +; GFX10-NEXT: v_alignbit_b32 v6, s4, s8, 1 +; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1 +; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 +; GFX10-NEXT: s_not_b32 s1, s15 +; GFX10-NEXT: s_lshr_b32 s6, s6, 1 +; GFX10-NEXT: s_not_b32 s7, s14 +; GFX10-NEXT: s_lshr_b32 s5, s5, 1 +; GFX10-NEXT: s_not_b32 s9, s13 +; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_not_b32 s8, s12 +; GFX10-NEXT: v_alignbit_b32 v3, s0, v0, s1 +; GFX10-NEXT: v_alignbit_b32 v2, s6, v1, s7 +; GFX10-NEXT: v_alignbit_b32 v1, s5, v5, s9 +; GFX10-NEXT: v_alignbit_b32 v0, s4, v6, s8 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) store <4 x i32> %0, <4 x i32> addrspace(1)* %in @@ -503,6 +591,21 @@ ; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshl_v4i32_imm: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 31 +; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 23 +; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 25 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 31 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> ) store <4 x i32> %0, <4 x i32> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10 declare i32 @llvm.fshr.i32(i32, i32, i32) declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) @@ -69,6 +70,19 @@ ; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X, +; +; GFX10-LABEL: fshr_i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s6, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s3, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) store i32 %0, i32 addrspace(1)* %in @@ -122,6 +136,17 @@ ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x, ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_i32_imm: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7) store i32 %0, i32 addrspace(1)* %in @@ -195,6 +220,22 @@ ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v2i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] +; GFX10-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) store <2 x i32> %0, <2 x i32> addrspace(1)* %in @@ -259,6 +300,19 @@ ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v2i32_imm: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v1, s3, s5, 9 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s4, 7 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) store <2 x i32> %0, <2 x i32> addrspace(1)* %in @@ -354,6 +408,26 @@ ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v4i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_alignbit_b32 v3, s15, s11, v0 +; GFX10-NEXT: v_alignbit_b32 v2, s14, s10, v1 +; GFX10-NEXT: v_alignbit_b32 v1, s13, s9, v4 +; GFX10-NEXT: v_alignbit_b32 v0, s12, s8, v5 +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] +; GFX10-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) store <4 x i32> %0, <4 x i32> addrspace(1)* %in @@ -432,6 +506,21 @@ ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v4i32_imm: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1 +; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9 +; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> ) store <4 x i32> %0, <4 x i32> addrspace(1)* %in @@ -449,6 +538,13 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) ret i32 %ret } @@ -465,6 +561,14 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) ret <2 x i32> %ret } @@ -482,6 +586,15 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 +; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) ret <3 x i32> %ret } @@ -500,6 +613,16 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9 +; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 +; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) ret <4 x i32> %ret } @@ -541,6 +664,19 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v2, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v3, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2) ret i16 %ret } @@ -600,6 +736,20 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) ret <2 x i16> %ret } @@ -688,6 +838,38 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-NEXT: v_and_b32_e32 v9, 15, v6 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_and_b32_e32 v15, 15, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-NEXT: v_lshrrev_b16_e64 v2, v4, v2 +; GFX10-NEXT: v_lshlrev_b16_e64 v10, 1, v10 +; GFX10-NEXT: v_and_b32_e32 v19, 15, v6 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v15, v0 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 +; GFX10-NEXT: v_lshrrev_b16_e64 v4, v9, v7 +; GFX10-NEXT: v_lshlrev_b16_e64 v1, 1, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v6, v19, v10 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 15, v11 +; GFX10-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX10-NEXT: v_or_b32_e32 v11, v6, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshlrev_b16_e64 v1, v7, v1 +; GFX10-NEXT: v_lshrrev_b16_e64 v2, v2, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v11, 16, v0 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) ret <3 x i16> %ret } @@ -802,6 +984,51 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX10-NEXT: v_lshlrev_b16_e64 v8, 1, v8 +; GFX10-NEXT: v_and_b32_e32 v13, 15, v10 +; GFX10-NEXT: v_lshlrev_b16_e64 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-NEXT: v_lshrrev_b16_e64 v6, v6, v7 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX10-NEXT: v_lshlrev_b16_e64 v11, 1, v11 +; GFX10-NEXT: v_lshlrev_b16_e64 v7, v9, v8 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX10-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX10-NEXT: v_and_b32_e32 v15, 15, v8 +; GFX10-NEXT: v_lshrrev_b16_e64 v2, v4, v2 +; GFX10-NEXT: v_lshrrev_b16_e64 v3, v5, v3 +; GFX10-NEXT: v_lshrrev_b16_e64 v4, v13, v12 +; GFX10-NEXT: v_lshlrev_b16_e64 v1, v10, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v15, v0 +; GFX10-NEXT: v_lshlrev_b16_e64 v5, v9, v11 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-NEXT: v_or_b32_e32 v3, v7, v6 +; GFX10-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v7, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) ret <4 x i16> %ret } @@ -850,6 +1077,20 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_not_b32_e32 v5, v4 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 63, v5 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2) ret i64 %ret } @@ -922,6 +1163,28 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_not_b32_e32 v9, v8 +; GFX10-NEXT: v_not_b32_e32 v11, v10 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: v_and_b32_e32 v15, 63, v8 +; GFX10-NEXT: v_and_b32_e32 v19, 63, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 63, v10 +; GFX10-NEXT: v_and_b32_e32 v13, 63, v11 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v15, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v9, v[6:7] +; GFX10-NEXT: v_lshlrev_b64 v[15:16], v13, v[2:3] +; GFX10-NEXT: v_or_b32_e32 v0, v11, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v12, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v15, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v16, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %ret } @@ -970,6 +1233,19 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaaab, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 4, v3 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2) ret i24 %ret } @@ -1039,6 +1315,27 @@ ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD +; +; GFX10-LABEL: v_fshr_v2i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX10-NEXT: v_mul_hi_u32 v6, v4, s4 +; GFX10-NEXT: v_mul_hi_u32 v7, v5, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 4, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 4, v7 +; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 8, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 8, v5 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v7 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v6 +; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) ret <2 x i24> %ret } diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32: @@ -84,6 +85,34 @@ ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_fadd_ret_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: BB0_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz BB0_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void @@ -170,6 +199,34 @@ ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: BB1_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz BB1_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -221,6 +278,32 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_fadd_noret_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: BB2_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz BB2_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -300,6 +383,32 @@ ; GFX90A-NEXT: s_cbranch_execnz BB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: BB3_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz BB3_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -368,6 +477,34 @@ ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_fadd_ret_f32_agent: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: BB4_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz BB4_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -454,6 +591,34 @@ ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm +; +; GFX10-LABEL: global_atomic_fadd_ret_f32_system: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: BB5_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz BB5_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst store float %result, float addrspace(1)* undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/gws-hazards.mir b/llvm/test/CodeGen/AMDGPU/gws-hazards.mir --- a/llvm/test/CodeGen/AMDGPU/gws-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/gws-hazards.mir @@ -3,6 +3,7 @@ # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=CI %s # RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX10 %s --- name: m0_gws_init0 @@ -29,6 +30,10 @@ ; SI: liveins: $vgpr0 ; SI: $m0 = S_MOV_B32 -1 ; SI: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; GFX10-LABEL: name: m0_gws_init0 + ; GFX10: liveins: $vgpr0 + ; GFX10: $m0 = S_MOV_B32 -1 + ; GFX10: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec $m0 = S_MOV_B32 -1 DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec @@ -58,6 +63,10 @@ ; SI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; SI: $m0 = S_MOV_B32 -1 ; SI: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; GFX10-LABEL: name: m0_gws_init1 + ; GFX10: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: $m0 = S_MOV_B32 -1 + ; GFX10: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec $vgpr0 = V_MOV_B32_e32 0, implicit $exec $m0 = S_MOV_B32 -1 DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec @@ -96,6 +105,11 @@ ; SI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec ; SI: $m0 = S_MOV_B32 $sgpr0 ; SI: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; GFX10-LABEL: name: m0_gws_readlane + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec + ; GFX10: $m0 = S_MOV_B32 $sgpr0 + ; GFX10: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec $m0 = S_MOV_B32 $sgpr0 DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) { ; GFX9-LABEL: udiv32_invariant_denom: @@ -43,6 +44,49 @@ ; GFX9-NEXT: s_cbranch_scc0 BB0_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: udiv32_invariant_denom: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: BB0_1: ; %bb3 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mul_lo_u32 v2, s5, v0 +; GFX10-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-NEXT: v_mul_lo_u32 v4, s3, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, s2, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, s4, v4 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, s4, v3 +; GFX10-NEXT: s_add_u32 s4, s4, 1 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 s0, s0, 4 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX10-NEXT: s_cbranch_scc0 BB0_1 +; GFX10-NEXT: ; %bb.2: ; %bb2 +; GFX10-NEXT: s_endpgm bb: br label %bb3 @@ -100,6 +144,47 @@ ; GFX9-NEXT: s_cbranch_scc0 BB1_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: urem32_invariant_denom: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: BB1_1: ; %bb3 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mul_lo_u32 v2, s5, v0 +; GFX10-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-NEXT: v_not_b32_e32 v3, v2 +; GFX10-NEXT: v_mul_lo_u32 v2, s3, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, s2, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, s4, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: s_add_u32 s4, s4, 1 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 s0, s0, 4 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX10-NEXT: s_cbranch_scc0 BB1_1 +; GFX10-NEXT: ; %bb.2: ; %bb2 +; GFX10-NEXT: s_endpgm bb: br label %bb3 @@ -160,6 +245,51 @@ ; GFX9-NEXT: s_cbranch_scc0 BB2_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sdiv32_invariant_denom: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_ashr_i32 s2, s3, 31 +; GFX10-NEXT: s_add_i32 s3, s3, s2 +; GFX10-NEXT: s_xor_b32 s3, s3, s2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX10-NEXT: s_sub_i32 s4, 0, s3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: BB2_1: ; %bb3 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, v2, s3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3 +; GFX10-NEXT: s_add_i32 s4, s4, 1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s2, v2 +; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 s0, s0, 4 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX10-NEXT: s_cbranch_scc0 BB2_1 +; GFX10-NEXT: ; %bb.2: ; %bb2 +; GFX10-NEXT: s_endpgm bb: br label %bb3 @@ -216,6 +346,47 @@ ; GFX9-NEXT: s_cbranch_scc0 BB3_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: srem32_invariant_denom: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_nop 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_ashr_i32 s3, s2, 31 +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: s_xor_b32 s2, s2, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: BB3_1: ; %bb3 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mul_hi_u32 v2, s3, v0 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 +; GFX10-NEXT: s_add_i32 s3, s3, 1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 s0, s0, 4 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX10-NEXT: s_cbranch_scc0 BB3_1 +; GFX10-NEXT: ; %bb.2: ; %bb2 +; GFX10-NEXT: s_endpgm bb: br label %bb3 @@ -267,6 +438,39 @@ ; GFX9-NEXT: s_cbranch_vccz BB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: udiv16_invariant_denom: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s0, s1, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: BB4_1: ; %bb3 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_and_b32_e32 v2, s1, v4 +; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v2 +; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 +; GFX10-NEXT: v_mul_f32_e32 v2, v7, v1 +; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v5 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 +; GFX10-NEXT: v_trunc_f32_e32 v2, v2 +; GFX10-NEXT: v_mad_f32 v7, -v2, v0, v7 +; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v7|, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v2, s0 +; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: s_cbranch_vccz BB4_1 +; GFX10-NEXT: ; %bb.2: ; %bb2 +; GFX10-NEXT: s_endpgm bb: br label %bb3 @@ -320,6 +524,41 @@ ; GFX9-NEXT: s_cbranch_vccz BB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: urem16_invariant_denom: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s4, s1, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: BB5_1: ; %bb3 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_and_b32_e32 v2, s1, v4 +; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v2 +; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] +; GFX10-NEXT: v_mul_f32_e32 v8, v7, v1 +; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 +; GFX10-NEXT: v_trunc_f32_e32 v10, v8 +; GFX10-NEXT: v_mad_f32 v7, -v10, v0, v7 +; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v10 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, s4 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v7, v2, v7 +; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: s_cbranch_vccz BB5_1 +; GFX10-NEXT: ; %bb.2: ; %bb2 +; GFX10-NEXT: s_endpgm bb: br label %bb3 @@ -375,6 +614,43 @@ ; GFX9-NEXT: s_cbranch_vccz BB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sdiv16_invariant_denom: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sext_i32_i16 s4, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: BB6_1: ; %bb3 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_bfe_i32 v5, v4, 0, 16 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, 1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v7, v5 +; GFX10-NEXT: v_xor_b32_e32 v8, s4, v5 +; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 +; GFX10-NEXT: v_mul_f32_e32 v2, v7, v1 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 30, v8 +; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v5 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX10-NEXT: v_trunc_f32_e32 v2, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 1, v8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 +; GFX10-NEXT: v_mad_f32 v7, -v2, v0, v7 +; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX10-NEXT: v_cmp_ge_f32_e64 s1, |v7|, |v0| +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s1 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v2, v7 +; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: s_cbranch_vccz BB6_1 +; GFX10-NEXT: ; %bb.2: ; %bb2 +; GFX10-NEXT: s_endpgm bb: br label %bb3 @@ -432,6 +708,45 @@ ; GFX9-NEXT: s_cbranch_vccz BB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: srem16_invariant_denom: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sext_i32_i16 s1, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: BB7_1: ; %bb3 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_bfe_i32 v7, v4, 0, 16 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, 1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v7 +; GFX10-NEXT: v_xor_b32_e32 v6, s1, v7 +; GFX10-NEXT: v_mul_f32_e32 v8, v11, v1 +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 30, v6 +; GFX10-NEXT: v_trunc_f32_e32 v10, v8 +; GFX10-NEXT: v_or_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_mad_f32 v5, -v10, v0, v11 +; GFX10-NEXT: v_cvt_i32_f32_e32 v8, v10 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v0| +; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc_lo +; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v8, v9 +; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v5 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, s1 +; GFX10-NEXT: v_sub_nc_u32_e32 v7, v7, v2 +; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: s_cbranch_vccz BB7_1 +; GFX10-NEXT: ; %bb.2: ; %bb2 +; GFX10-NEXT: s_endpgm bb: br label %bb3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10 define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: @@ -38,6 +39,17 @@ ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -75,6 +87,17 @@ ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s4, s4 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -84,6 +107,10 @@ ; GCN-LABEL: s_cvt_pkrtz_undef_undef: ; GCN: ; %bb.0: ; GCN-NEXT: s_endpgm +; +; GFX10-LABEL: s_cvt_pkrtz_undef_undef: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -146,6 +173,21 @@ ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -202,6 +244,17 @@ ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -256,6 +309,17 @@ ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -323,6 +387,21 @@ ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -393,6 +472,21 @@ ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -463,6 +557,21 @@ ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -534,6 +643,21 @@ ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX6 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s define amdgpu_kernel void @cos_f16(half addrspace(1)* %r, half addrspace(1)* %a) { ; GFX6-LABEL: cos_f16: @@ -53,6 +54,18 @@ ; GFX9-NEXT: v_cos_f16_e32 v1, v1 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: cos_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX10-NEXT: v_cos_f16_e32 v1, v1 +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %a.val = load half, half addrspace(1)* %a %r.val = call half @llvm.cos.f16(half %a.val) store half %r.val, half addrspace(1)* %r @@ -128,6 +141,23 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: cos_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_cos_f16_e32 v2, v2 +; GFX10-NEXT: v_cos_f16_e32 v1, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val) store <2 x half> %r.val, <2 x half> addrspace(1)* %r diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -86,6 +87,32 @@ ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: maxnum_f16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s12, s6 +; GFX10-NEXT: s_mov_b32 s13, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { @@ -156,6 +183,25 @@ ; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: maxnum_f16_imm_a: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: @@ -224,6 +270,25 @@ ; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: maxnum_f16_imm_b: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a) #0 { entry: @@ -306,6 +371,23 @@ ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: maxnum_v2f16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) #0 { @@ -375,6 +457,19 @@ ; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: maxnum_v2f16_imm_a: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) #0 { entry: @@ -442,6 +537,19 @@ ; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: maxnum_v2f16_imm_b: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 { entry: @@ -540,6 +648,27 @@ ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: maxnum_v3f16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 +; GFX10-NEXT: v_pk_max_f16 v1, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v3, v0 +; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm <3 x half> addrspace(1)* %r, <3 x half> addrspace(1)* %a, <3 x half> addrspace(1)* %b) #0 { @@ -653,6 +782,26 @@ ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: maxnum_v4f16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 +; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v3, v2 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %a, <4 x half> addrspace(1)* %b) #0 { @@ -745,6 +894,21 @@ ; GFX9-NEXT: v_pk_max_f16 v0, v2, s9 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: fmax_v4f16_imm_a: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, v0 +; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %b) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -86,6 +87,32 @@ ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: minnum_f16_ieee: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s12, s6 +; GFX10-NEXT: s_mov_b32 s13, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { @@ -116,6 +143,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: minnum_f16_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: ; return to shader part epilog %r.val = call half @llvm.minnum.f16(half %a, half %b) ret half %r.val } @@ -179,6 +211,25 @@ ; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: minnum_f16_imm_a: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: @@ -247,6 +298,25 @@ ; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: minnum_f16_imm_b: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a) #0 { entry: @@ -329,6 +399,23 @@ ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: minnum_v2f16_ieee: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) #0 { @@ -366,6 +453,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: minnum_v2f16_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX10-NEXT: ; return to shader part epilog %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %r.val } @@ -428,6 +520,19 @@ ; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: minnum_v2f16_imm_a: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) #0 { entry: @@ -495,6 +600,19 @@ ; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: minnum_v2f16_imm_b: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 { entry: @@ -593,6 +711,27 @@ ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: minnum_v3f16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 +; GFX10-NEXT: v_pk_min_f16 v1, v2, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v3, v0 +; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm <3 x half> addrspace(1)* %r, <3 x half> addrspace(1)* %a, <3 x half> addrspace(1)* %b) #0 { @@ -706,6 +845,26 @@ ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: minnum_v4f16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 +; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 +; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 +; GFX10-NEXT: v_pk_min_f16 v1, v1, v0 +; GFX10-NEXT: v_pk_min_f16 v0, v3, v2 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %a, <4 x half> addrspace(1)* %b) #0 { @@ -798,6 +957,21 @@ ; GFX9-NEXT: v_pk_min_f16 v0, v2, s9 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: fmin_v4f16_imm_a: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0 +; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %b) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; SI-LABEL: umulo_i64_v_v: @@ -48,6 +49,30 @@ ; GFX9-NEXT: v_add3_u32 v1, v6, v5, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: umulo_i64_v_v: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_lo_u32 v5, v0, v3 +; GFX10-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX10-NEXT: v_mul_hi_u32 v4, v0, v3 +; GFX10-NEXT: v_mul_lo_u32 v8, v1, v2 +; GFX10-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX10-NEXT: v_mul_hi_u32 v9, v1, v3 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX10-NEXT: v_add_co_u32_e64 v10, vcc_lo, v6, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v3, vcc_lo, v10, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v3, vcc_lo, v3, v1 +; GFX10-NEXT: v_add3_u32 v1, v6, v5, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[3:4] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y) ret { i64, i1 } %umulo @@ -127,6 +152,42 @@ ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: smulo_i64_s_s: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_lo_u32 v15, v0, v3 +; GFX10-NEXT: v_mul_hi_u32 v5, v0, v2 +; GFX10-NEXT: v_mul_hi_u32 v6, v0, v3 +; GFX10-NEXT: v_mul_lo_u32 v8, v1, v2 +; GFX10-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX10-NEXT: v_mul_hi_i32 v9, v1, v3 +; GFX10-NEXT: v_mul_lo_u32 v11, v1, v3 +; GFX10-NEXT: v_add_co_u32_e64 v10, vcc_lo, v5, v15 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v10, vcc_lo, v10, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v11, vcc_lo, v6, v11 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_sub_co_u32_e64 v9, vcc_lo, v11, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_add3_u32 v1, v5, v15, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GFX10-NEXT: v_sub_co_u32_e64 v8, vcc_lo, v6, v0 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y) ret { i64, i1 } %smulo @@ -193,6 +254,33 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: umulo_i64_s: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mul_i32 s7, s0, s3 +; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 +; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3 +; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2 +; GFX10-NEXT: s_mul_i32 s6, s1, s2 +; GFX10-NEXT: s_mul_hi_u32 s9, s1, s3 +; GFX10-NEXT: s_mul_i32 s1, s1, s3 +; GFX10-NEXT: s_add_u32 s3, s8, s7 +; GFX10-NEXT: s_addc_u32 s5, 0, s5 +; GFX10-NEXT: s_add_u32 s3, s3, s6 +; GFX10-NEXT: s_addc_u32 s3, s5, s4 +; GFX10-NEXT: s_addc_u32 s5, s9, 0 +; GFX10-NEXT: s_add_u32 s4, s3, s1 +; GFX10-NEXT: s_addc_u32 s5, 0, s5 +; GFX10-NEXT: s_add_i32 s1, s8, s7 +; GFX10-NEXT: v_cmp_ne_u64_e64 s3, s[4:5], 0 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_mul_i32 s0, s0, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, 0, s3 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_endpgm bb: %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y) %mul = extractvalue { i64, i1 } %umulo, 0 @@ -292,6 +380,47 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: smulo_i64_s: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mul_i32 s7, s0, s3 +; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 +; GFX10-NEXT: s_mul_hi_u32 s6, s0, s3 +; GFX10-NEXT: s_add_u32 s11, s8, s7 +; GFX10-NEXT: s_mul_i32 s5, s1, s2 +; GFX10-NEXT: s_addc_u32 s6, 0, s6 +; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2 +; GFX10-NEXT: s_add_u32 s11, s11, s5 +; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3 +; GFX10-NEXT: s_addc_u32 s4, s6, s4 +; GFX10-NEXT: s_mul_i32 s10, s1, s3 +; GFX10-NEXT: s_addc_u32 s6, s9, 0 +; GFX10-NEXT: s_add_u32 s4, s4, s10 +; GFX10-NEXT: s_addc_u32 s6, 0, s6 +; GFX10-NEXT: s_sub_u32 s9, s4, s2 +; GFX10-NEXT: s_subb_u32 s10, s6, 0 +; GFX10-NEXT: v_cmp_lt_i32_e64 vcc_lo, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-NEXT: v_mov_b32_e32 v1, s10 +; GFX10-NEXT: s_add_i32 s1, s8, s7 +; GFX10-NEXT: s_add_i32 s1, s1, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v0, s4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, s6, v1, vcc_lo +; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: v_sub_co_u32_e64 v2, vcc_lo, v0, s0 +; GFX10-NEXT: s_mul_i32 s0, s0, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i32_e64 vcc_lo, s3, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_endpgm bb: %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y) %mul = extractvalue { i64, i1 } %umulo, 0 @@ -325,6 +454,19 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: smulo_i64_v_4: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] +; GFX10-NEXT: v_alignbit_b32 v3, v1, v0, 30 +; GFX10-NEXT: v_ashrrev_i64 v[6:7], 2, v[4:5] +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4) ret { i64, i1 } %umulo @@ -356,6 +498,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: umulo_i64_v_4: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] +; GFX10-NEXT: v_alignbit_b32 v3, v1, v0, 30 +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] bb: %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4) ret { i64, i1 } %umulo diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX6 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s define amdgpu_kernel void @sin_f16(half addrspace(1)* %r, half addrspace(1)* %a) { ; GFX6-LABEL: sin_f16: @@ -53,6 +54,18 @@ ; GFX9-NEXT: v_sin_f16_e32 v1, v1 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sin_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX10-NEXT: v_sin_f16_e32 v1, v1 +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %a.val = load half, half addrspace(1)* %a %r.val = call half @llvm.sin.f16(half %a.val) store half %r.val, half addrspace(1)* %r @@ -128,6 +141,23 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sin_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v1 +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_sin_f16_e32 v2, v2 +; GFX10-NEXT: v_sin_f16_e32 v1, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val) store <2 x half> %r.val, <2 x half> addrspace(1)* %r diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) { ; GFX9-LABEL: load_lds_v4i32: @@ -28,6 +29,14 @@ ; GFX6-NEXT: ds_read_b64 v[0:1], v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b128 v[0:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr ret <4 x i32> %load } @@ -198,6 +207,48 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u8 v1, v0 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v12, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v13, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v14, v0 offset:13 +; GFX10-NEXT: ds_read_u8 v15, v0 offset:14 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 +; GFX10-NEXT: s_waitcnt lgkmcnt(14) +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(12) +; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(10) +; GFX10-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; GFX10-NEXT: s_waitcnt lgkmcnt(8) +; GFX10-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX10-NEXT: s_waitcnt lgkmcnt(6) +; GFX10-NEXT: v_lshl_or_b32 v5, v10, 8, v9 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_lshl_or_b32 v6, v12, 8, v11 +; GFX10-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-NEXT: v_lshl_or_b32 v7, v14, 8, v13 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshl_or_b32 v8, v0, 8, v15 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 ret <4 x i32> %load } @@ -280,6 +331,28 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u16 v1, v0 +; GFX10-NEXT: ds_read_u16 v2, v0 offset:2 +; GFX10-NEXT: ds_read_u16 v3, v0 offset:4 +; GFX10-NEXT: ds_read_u16 v4, v0 offset:6 +; GFX10-NEXT: ds_read_u16 v5, v0 offset:8 +; GFX10-NEXT: ds_read_u16 v6, v0 offset:10 +; GFX10-NEXT: ds_read_u16 v7, v0 offset:12 +; GFX10-NEXT: ds_read_u16 v8, v0 offset:14 +; GFX10-NEXT: s_waitcnt lgkmcnt(6) +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2 ret <4 x i32> %load } @@ -317,6 +390,16 @@ ; GFX6-NEXT: ds_read_b32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX10-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4 ret <4 x i32> %load } @@ -346,6 +429,14 @@ ; GFX6-NEXT: ds_read_b64 v[0:1], v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8 ret <4 x i32> %load } @@ -375,6 +466,14 @@ ; GFX6-NEXT: ds_read_b64 v[0:1], v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v4i32_align16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b128 v[0:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16 ret <4 x i32> %load } diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) { ; GFX9-LABEL: load_lds_v3i32: @@ -28,6 +29,14 @@ ; GFX6-NEXT: ds_read_b64 v[0:1], v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b96 v[0:2], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr ret <3 x i32> %load } @@ -162,6 +171,39 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u8 v1, v0 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:11 +; GFX10-NEXT: s_waitcnt lgkmcnt(10) +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(8) +; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(6) +; GFX10-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX10-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-NEXT: v_lshl_or_b32 v5, v10, 8, v9 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshl_or_b32 v6, v0, 8, v11 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1 ret <3 x i32> %load } @@ -231,6 +273,24 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_u16 v1, v0 +; GFX10-NEXT: ds_read_u16 v2, v0 offset:2 +; GFX10-NEXT: ds_read_u16 v3, v0 offset:4 +; GFX10-NEXT: ds_read_u16 v4, v0 offset:6 +; GFX10-NEXT: ds_read_u16 v5, v0 offset:8 +; GFX10-NEXT: ds_read_u16 v6, v0 offset:10 +; GFX10-NEXT: s_waitcnt lgkmcnt(4) +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(2) +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2 ret <3 x i32> %load } @@ -266,6 +326,16 @@ ; GFX6-NEXT: ds_read_b32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX10-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4 ret <3 x i32> %load } @@ -301,6 +371,16 @@ ; GFX6-NEXT: ds_read_b32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX10-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8 ret <3 x i32> %load } @@ -330,6 +410,14 @@ ; GFX6-NEXT: ds_read_b64 v[0:1], v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: load_lds_v3i32_align16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b96 v[0:2], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16 ret <3 x i32> %load } diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: @@ -57,6 +58,18 @@ ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: s_lshr_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_lshrrev_b16 v1, s5, s4 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NEXT: s_endpgm %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void @@ -122,6 +135,19 @@ ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: v_lshr_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -193,6 +219,18 @@ ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: lshr_v_s_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -262,6 +300,18 @@ ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: lshr_s_v_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -324,6 +374,17 @@ ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: lshr_imm_v_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -381,6 +442,17 @@ ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: lshr_v_imm_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -463,6 +535,20 @@ ; CI-NEXT: v_or_b32_e32 v2, v2, v4 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: v_lshr_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0 +; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -529,6 +615,18 @@ ; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: lshr_v_imm_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/ptrmask.ll b/llvm/test/CodeGen/AMDGPU/ptrmask.ll --- a/llvm/test/CodeGen/AMDGPU/ptrmask.ll +++ b/llvm/test/CodeGen/AMDGPU/ptrmask.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s define i8 addrspace(1)* @v_ptrmask_global_variable_i64(i8 addrspace(1)* %ptr, i64 %mask) { ; GCN-LABEL: v_ptrmask_global_variable_i64: @@ -8,6 +9,14 @@ ; GCN-NEXT: v_and_b32_e32 v1, v1, v3 ; GCN-NEXT: v_and_b32_e32 v0, v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ptrmask_global_variable_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)* %ptr, i64 %mask) ret i8 addrspace(1)* %masked } @@ -19,6 +28,14 @@ ; GCN-NEXT: v_and_b32_e32 v0, v0, v2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ptrmask_global_variable_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i32(i8 addrspace(1)* %ptr, i32 %mask) ret i8 addrspace(1)* %masked } @@ -30,6 +47,14 @@ ; GCN-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ptrmask_global_variable_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i16(i8 addrspace(1)* %ptr, i16 %mask) ret i8 addrspace(1)* %masked } @@ -40,6 +65,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ptrmask_local_variable_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i64(i8 addrspace(3)* %ptr, i64 %mask) ret i8 addrspace(3)* %masked } @@ -50,6 +82,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ptrmask_local_variable_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* %ptr, i32 %mask) ret i8 addrspace(3)* %masked } @@ -60,6 +99,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ptrmask_local_variable_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i16(i8 addrspace(3)* %ptr, i16 %mask) ret i8 addrspace(3)* %masked } @@ -69,6 +115,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ptrmask_global_variable_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)* %ptr, i64 %mask) ret i8 addrspace(1)* %masked } @@ -80,6 +131,13 @@ ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ptrmask_global_variable_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: ; return to shader part epilog %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i32(i8 addrspace(1)* %ptr, i32 %mask) ret i8 addrspace(1)* %masked } @@ -92,6 +150,14 @@ ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ptrmask_global_variable_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_and_b32 s0, s4, 0xffff +; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: ; return to shader part epilog %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i16(i8 addrspace(1)* %ptr, i16 %mask) ret i8 addrspace(1)* %masked } @@ -101,6 +167,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_and_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ptrmask_local_variable_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i64(i8 addrspace(3)* %ptr, i64 %mask) ret i8 addrspace(3)* %masked } @@ -110,6 +181,11 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_and_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ptrmask_local_variable_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* %ptr, i32 %mask) ret i8 addrspace(3)* %masked } @@ -120,6 +196,12 @@ ; GCN-NEXT: s_and_b32 s0, 0xffff, s3 ; GCN-NEXT: s_and_b32 s0, s2, s0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_ptrmask_local_variable_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s0, 0xffff, s3 +; GFX10-NEXT: s_and_b32 s0, s2, s0 +; GFX10-NEXT: ; return to shader part epilog %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i16(i8 addrspace(3)* %ptr, i16 %mask) ret i8 addrspace(3)* %masked } diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone @@ -74,6 +75,24 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: saddo_i64_zext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s6, s2 +; GFX10-NEXT: s_addc_u32 s1, s7, s3 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7] +; GFX10-NEXT: s_xor_b32 s2, s2, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 %carry = extractvalue { i64, i1 } %sadd, 1 @@ -144,6 +163,22 @@ ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_saddo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_nc_i32 v0, s2, s3 clamp +; GFX10-NEXT: s_add_i32 s0, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dword v1, v2, s[4:5] +; GFX10-NEXT: global_store_byte v1, v0, s[6:7] +; GFX10-NEXT: s_endpgm %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %sadd, 0 %carry = extractvalue { i32, i1 } %sadd, 1 @@ -222,6 +257,23 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_saddo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_i32 v3, v1, v2 clamp +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v2, s[2:3] +; GFX10-NEXT: s_endpgm %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -300,6 +352,23 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: global_store_byte v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_saddo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s8, s4, s6 +; GFX10-NEXT: s_addc_u32 s9, s5, s7 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: s_xor_b32 s4, s6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_byte v2, v3, s[2:3] +; GFX10-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 %carry = extractvalue { i64, i1 } %sadd, 1 @@ -382,6 +451,25 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: global_store_byte v6, v0, s[6:7] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_saddo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[9:10], v6, s[8:9] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32_e64 v7, vcc_lo, v9, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v10, v3, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[7:8], v[9:10] +; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v6, v[7:8], s[4:5] +; GFX10-NEXT: global_store_byte v6, v0, s[6:7] +; GFX10-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind @@ -476,6 +564,27 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_saddo_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_i32 v5, v1, v3 clamp +; GFX10-NEXT: v_add_nc_u32_e32 v10, v1, v3 +; GFX10-NEXT: v_add_nc_i32 v6, v0, v2 clamp +; GFX10-NEXT: v_add_nc_u32_e32 v9, v0, v2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v9, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v4, v[9:10], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] +; GFX10-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_saddsat_i8: @@ -30,6 +31,16 @@ ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_saddsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 +; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp +; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -64,6 +75,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_saddsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -102,6 +120,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_saddsat_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_nc_i32 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result } @@ -158,6 +183,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_saddsat_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result } @@ -228,6 +260,14 @@ ; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp ; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_saddsat_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_add_i16 v1, v1, v3 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) ret <3 x i16> %result } @@ -316,6 +356,14 @@ ; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp ; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_saddsat_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_add_i16 v1, v1, v3 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -370,6 +418,14 @@ ; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp ; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_saddsat_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp +; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result } @@ -425,6 +481,23 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_saddsat_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_co_u32_e64 v10, vcc_lo, v0, v2 +; GFX10-NEXT: v_bfrev_b32_e32 v6, -2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] +; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[10:11] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v6, s5 +; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: @@ -59,6 +60,19 @@ ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: s_shl_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void @@ -124,6 +138,19 @@ ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: v_shl_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v2, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -195,6 +222,18 @@ ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: shl_v_s_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshlrev_b16 v1, s0, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -264,6 +303,18 @@ ; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: shl_s_v_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -327,6 +378,17 @@ ; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: shl_imm_v_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -385,6 +447,17 @@ ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: shl_v_imm_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -467,6 +540,20 @@ ; CI-NEXT: v_or_b32_e32 v2, v2, v4 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: v_shl_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -540,6 +627,18 @@ ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm +; +; GFX10-LABEL: shl_v_imm_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_ssubsat_i8: @@ -30,6 +31,16 @@ ; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 +; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp +; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -64,6 +75,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -102,6 +120,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result } @@ -158,6 +183,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result } @@ -229,6 +261,14 @@ ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 clamp ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) ret <3 x i16> %result } @@ -317,6 +357,14 @@ ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 clamp ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -371,6 +419,14 @@ ; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp ; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v2 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v3 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result } @@ -439,6 +495,15 @@ ; GFX9-NEXT: v_sub_i32 v1, v1, v4 clamp ; GFX9-NEXT: v_sub_i32 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v3 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v4 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, v2, v5 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result } @@ -522,6 +587,16 @@ ; GFX9-NEXT: v_sub_i32 v2, v2, v6 clamp ; GFX9-NEXT: v_sub_i32 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v4 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v5 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, v2, v6 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, v3, v7 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result } @@ -665,6 +740,20 @@ ; GFX9-NEXT: v_sub_i32 v6, v6, v14 clamp ; GFX9-NEXT: v_sub_i32 v7, v7, v15 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_v8i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v8 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v9 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, v2, v10 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, v3, v11 clamp +; GFX10-NEXT: v_sub_nc_i32 v4, v4, v12 clamp +; GFX10-NEXT: v_sub_nc_i32 v5, v5, v13 clamp +; GFX10-NEXT: v_sub_nc_i32 v6, v6, v14 clamp +; GFX10-NEXT: v_sub_nc_i32 v7, v7, v15 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs) ret <8 x i32> %result } @@ -928,6 +1017,28 @@ ; GFX9-NEXT: v_sub_i32 v14, v14, v30 clamp ; GFX9-NEXT: v_sub_i32 v15, v15, v31 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v16 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v17 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, v2, v18 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, v3, v19 clamp +; GFX10-NEXT: v_sub_nc_i32 v4, v4, v20 clamp +; GFX10-NEXT: v_sub_nc_i32 v5, v5, v21 clamp +; GFX10-NEXT: v_sub_nc_i32 v6, v6, v22 clamp +; GFX10-NEXT: v_sub_nc_i32 v7, v7, v23 clamp +; GFX10-NEXT: v_sub_nc_i32 v8, v8, v24 clamp +; GFX10-NEXT: v_sub_nc_i32 v9, v9, v25 clamp +; GFX10-NEXT: v_sub_nc_i32 v10, v10, v26 clamp +; GFX10-NEXT: v_sub_nc_i32 v11, v11, v27 clamp +; GFX10-NEXT: v_sub_nc_i32 v12, v12, v28 clamp +; GFX10-NEXT: v_sub_nc_i32 v13, v13, v29 clamp +; GFX10-NEXT: v_sub_nc_i32 v14, v14, v30 clamp +; GFX10-NEXT: v_sub_nc_i32 v15, v15, v31 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result } @@ -984,6 +1095,23 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_ssubsat_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_co_u32_e64 v10, vcc_lo, v0, v2 +; GFX10-NEXT: v_bfrev_b32_e32 v6, -2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] +; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[10:11] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v6, s5 +; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: @@ -44,6 +45,20 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: ds_write_b128 v4, v[0:3] +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out ret void } @@ -198,6 +213,51 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: s_lshr_b32 s3, s6, 24 +; GFX10-NEXT: s_lshr_b32 s0, s7, 8 +; GFX10-NEXT: s_lshr_b32 s2, s6, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: s_lshr_b32 s6, s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_lshr_b32 s1, s7, 24 +; GFX10-NEXT: s_lshr_b32 s5, s5, 24 +; GFX10-NEXT: v_mov_b32_e32 v15, s3 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v9, s6 +; GFX10-NEXT: s_lshr_b32 s0, s4, 8 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: v_mov_b32_e32 v11, s4 +; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:8 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v11 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v11 offset:2 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:13 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:15 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:9 +; GFX10-NEXT: s_lshr_b32 s1, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: ds_write_b8 v0, v15 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:3 +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void } @@ -280,6 +340,27 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v7, s4 +; GFX10-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 +; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v7 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v7 offset:2 +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void } @@ -328,6 +409,21 @@ ; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: ds_write2_b32 v0, v2, v1 offset0:2 offset1:3 ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v6, s7 +; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 +; GFX10-NEXT: ds_write2_b32 v0, v3, v6 offset0:2 offset1:3 +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4 ret void } @@ -373,6 +469,20 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32_align8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8 ret void } @@ -418,6 +528,20 @@ ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v4i32_align16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: ds_write_b128 v4, v[0:3] +; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: @@ -42,6 +43,19 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b64 v2, v[0:1] ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: ds_write_b96 v3, v[0:2] +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out ret void } @@ -165,6 +179,42 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_lshr_b32 s0, s6, 8 +; GFX10-NEXT: s_lshr_b32 s1, s6, 24 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s3, s5, 24 +; GFX10-NEXT: s_lshr_b32 s5, s4, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_lshr_b32 s4, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v11, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 +; GFX10-NEXT: v_mov_b32_e32 v15, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, s4 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v3 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 +; GFX10-NEXT: ds_write_b8 v0, v11 offset:9 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v15 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:3 +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void } @@ -234,6 +284,24 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32_align2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v3 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 ret void } @@ -279,6 +347,20 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: ds_write_b32 v0, v1 offset:8 ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 +; GFX10-NEXT: ds_write_b32 v0, v3 offset:8 +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4 ret void } @@ -324,6 +406,20 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b64 v2, v[0:1] ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32_align8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: ds_write_b32 v2, v3 offset:8 +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8 ret void } @@ -367,6 +463,19 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b64 v2, v[0:1] ; GFX6-NEXT: s_endpgm +; +; GFX10-LABEL: store_lds_v3i32_align16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: ds_write_b96 v3, v[0:2] +; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,HAWAII %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,FIJI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { ; CIVI-LABEL: local_store_i56: @@ -23,6 +24,16 @@ ; GFX9-NEXT: ds_write_b32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: local_store_i56: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b32 v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] store i56 %arg, i56 addrspace(3)* %ptr, align 8 ret void } @@ -92,6 +103,27 @@ ; GFX9-NEXT: ds_write_b8_d16_hi v1, v0 offset:6 ; GFX9-NEXT: ds_write_b32 v1, v3 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: local_store_i55: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0xc +; GFX10-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s3, s0, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v1, v0 offset:6 +; GFX10-NEXT: ds_write_b32 v1, v3 +; GFX10-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void } @@ -137,6 +169,20 @@ ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b32 v0, v2 ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: local_store_i48: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s1, s[4:5], 0xc +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX10-NEXT: ds_write_b32 v0, v2 +; GFX10-NEXT: s_endpgm store i48 %arg, i48 addrspace(3)* %ptr, align 8 ret void } @@ -188,6 +234,22 @@ ; GFX9-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: local_store_i65: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s2, s2, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: ds_write_b8 v2, v3 offset:8 +; GFX10-NEXT: ds_write_b64 v2, v[0:1] +; GFX10-NEXT: s_endpgm store i65 %arg, i65 addrspace(3)* %ptr, align 8 ret void } @@ -209,6 +271,15 @@ ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: local_store_i13: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX10-NEXT: ds_write_b16 v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] store i13 %arg, i13 addrspace(3)* %ptr, align 8 ret void } @@ -232,6 +303,16 @@ ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: local_store_i17: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 +; GFX10-NEXT: ds_write_b16 v0, v1 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s ; FIXME: promotion not handled without f16 insts define half @v_constained_fadd_f16_fpexcept_strict(half %x, half %y) #0 { @@ -9,6 +10,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -19,6 +27,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f16_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret half %val } @@ -29,6 +44,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f16_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret half %val } @@ -47,6 +69,13 @@ ; GFX8-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v2f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } @@ -65,6 +94,13 @@ ; GFX8-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v2f16_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x half> %val } @@ -83,6 +119,13 @@ ; GFX8-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x half> %val } @@ -103,6 +146,14 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v3f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX10-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fadd.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val } @@ -133,6 +184,21 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v4f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f16_e32 v4, v0, v2 +; GFX10-NEXT: v_add_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX10-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v2, v5, v4 +; GFX10-NEXT: v_and_b32_e32 v3, v5, v6 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val } @@ -143,6 +209,11 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_add_f16_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fadd_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f16_e64 v0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -165,6 +236,11 @@ ; GFX8-NEXT: v_add_f16_e32 v1, s2, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fadd_v2f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_pk_add_f16 v0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define float @v_constained_fadd_f32_fpexcept_strict(float %x, float %y) #0 { ; GCN-LABEL: v_constained_fadd_f32_fpexcept_strict: @@ -7,6 +8,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -17,6 +25,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f32_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val } @@ -27,6 +42,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f32_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret float %val } @@ -38,6 +60,14 @@ ; GCN-NEXT: v_add_f32_e32 v0, v0, v2 ; GCN-NEXT: v_add_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v2f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x float> %val } @@ -49,6 +79,14 @@ ; GCN-NEXT: v_add_f32_e32 v0, v0, v2 ; GCN-NEXT: v_add_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v2f32_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %val } @@ -60,6 +98,14 @@ ; GCN-NEXT: v_add_f32_e32 v0, v0, v2 ; GCN-NEXT: v_add_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v2f32_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x float> %val } @@ -72,6 +118,15 @@ ; GCN-NEXT: v_add_f32_e32 v1, v1, v4 ; GCN-NEXT: v_add_f32_e32 v2, v2, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v3f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float> %x, <3 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x float> %val } @@ -82,6 +137,11 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_add_f32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fadd_f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f32_e64 v0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -92,6 +152,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e64 v0, |v0|, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f32_fpexcept_strict_fabs_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %val = call float @llvm.experimental.constrained.fadd.f32(float %fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -103,6 +170,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f32_e64 v0, v0, |v1| ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f32_fpexcept_strict_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f32_e64 v0, v0, |v1| +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %fabs.y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -114,6 +188,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_f32_e64 v0, v1, |v0| ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e64 v0, v1, |v0| +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %neg.fabs.x = fneg float %fabs.x %val = call float @llvm.experimental.constrained.fadd.f32(float %neg.fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define double @v_constained_fadd_f64_fpexcept_strict(double %x, double %y) #0 { ; GCN-LABEL: v_constained_fadd_f64_fpexcept_strict: @@ -7,6 +8,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret double %val } @@ -17,6 +25,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f64_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret double %val } @@ -27,6 +42,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_f64_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret double %val } @@ -38,6 +60,18 @@ ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] ; GCN-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v2f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7] +; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x double> %val } @@ -49,6 +83,18 @@ ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] ; GCN-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v2f64_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7] +; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %val } @@ -60,6 +106,18 @@ ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] ; GCN-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v2f64_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7] +; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x double> %val } @@ -72,6 +130,15 @@ ; GCN-NEXT: v_add_f64 v[2:3], v[2:3], v[8:9] ; GCN-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fadd_v3f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[6:7] +; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[8:9] +; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x double> @llvm.experimental.constrained.fadd.v3f64(<3 x double> %x, <3 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x double> %val } @@ -83,6 +150,11 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fadd_f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f64 v[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") %cast = bitcast double %val to <2 x float> ret <2 x float> %cast diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0 { ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict: @@ -8,6 +9,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f16 v0, v0, v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -30,6 +39,13 @@ ; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v2f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } @@ -54,6 +70,16 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v3f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX10-NEXT: v_fmac_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val } @@ -97,6 +123,29 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v4f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v14, v5 +; GFX10-NEXT: v_mov_b32_e32 v15, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX10-NEXT: v_fmac_f16_e32 v15, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: v_fmac_f16_e32 v14, v1, v3 +; GFX10-NEXT: v_fmac_f16_e32 v5, v8, v7 +; GFX10-NEXT: v_fmac_f16_e32 v4, v11, v10 +; GFX10-NEXT: v_and_b32_e32 v1, v0, v15 +; GFX10-NEXT: v_and_b32_e32 v2, v0, v14 +; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val } @@ -107,6 +156,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.z = fneg half %z %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val @@ -118,6 +174,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f16 v0, -v0, -v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fmac_f16_e64 v2, -v0, -v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg half %x %neg.y = fneg half %y %val = call half @llvm.experimental.constrained.fma.f16(half %neg.x, half %neg.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -130,6 +194,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f16 v0, |v0|, |v1|, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fmac_f16_e64 v2, |v0|, |v1| +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = call half @llvm.fabs.f16(half %x) %neg.y = call half @llvm.fabs.f16(half %y) %val = call half @llvm.experimental.constrained.fma.f16(half %neg.x, half %neg.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -154,6 +226,13 @@ ; GFX8-NEXT: v_fma_f16 v0, -v0, -v1, v2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg <2 x half> %x %neg.y = fneg <2 x half> %y %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %neg.x, <2 x half> %neg.y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define float @v_constained_fma_f32_fpexcept_strict(float %x, float %y, float %z) #0 { ; GCN-LABEL: v_constained_fma_f32_fpexcept_strict: @@ -7,6 +8,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f32 v0, v0, v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -18,6 +26,14 @@ ; GCN-NEXT: v_fma_f32 v0, v0, v2, v4 ; GCN-NEXT: v_fma_f32 v1, v1, v3, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v2f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX10-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x float> %val } @@ -30,6 +46,15 @@ ; GCN-NEXT: v_fma_f32 v1, v1, v4, v7 ; GCN-NEXT: v_fma_f32 v2, v2, v5, v8 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v3f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, v0, v3, v6 +; GFX10-NEXT: v_fma_f32 v1, v1, v4, v7 +; GFX10-NEXT: v_fma_f32 v2, v2, v5, v8 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x float> @llvm.experimental.constrained.fma.v3f32(<3 x float> %x, <3 x float> %y, <3 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x float> %val } @@ -43,6 +68,16 @@ ; GCN-NEXT: v_fma_f32 v2, v2, v6, v10 ; GCN-NEXT: v_fma_f32 v3, v3, v7, v11 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v4f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX10-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX10-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX10-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x float> %val } @@ -53,6 +88,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f32 v0, v0, v1, -v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f32_fpexcept_strict_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.z = fneg float %z %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -64,6 +106,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f32 v0, -v0, -v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, -v1, -v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %neg.y = fneg float %y %val = call float @llvm.experimental.constrained.fma.f32(float %neg.x, float %neg.y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -76,6 +125,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, |v1|, |v0|, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = call float @llvm.fabs.f32(float %x) %neg.y = call float @llvm.fabs.f32(float %y) %val = call float @llvm.experimental.constrained.fma.f32(float %neg.x, float %neg.y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -89,6 +145,14 @@ ; GCN-NEXT: v_fma_f32 v0, -v0, -v2, v4 ; GCN-NEXT: v_fma_f32 v1, -v1, -v3, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f32 v0, -v2, -v0, v4 +; GFX10-NEXT: v_fma_f32 v1, -v3, -v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg <2 x float> %x %neg.y = fneg <2 x float> %y %val = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %neg.x, <2 x float> %neg.y, <2 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define double @v_constained_fma_f64_fpexcept_strict(double %x, double %y, double %z) #0 { ; GCN-LABEL: v_constained_fma_f64_fpexcept_strict: @@ -7,6 +8,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret double %val } @@ -18,6 +26,18 @@ ; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] ; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v2f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v13, v3 +; GFX10-NEXT: v_mov_b32_e32 v12, v2 +; GFX10-NEXT: v_mov_b32_e32 v15, v1 +; GFX10-NEXT: v_mov_b32_e32 v14, v0 +; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11] +; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x double> %val } @@ -30,6 +50,15 @@ ; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15] ; GCN-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v3f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15] +; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x double> @llvm.experimental.constrained.fma.v3f64(<3 x double> %x, <3 x double> %y, <3 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x double> %val } @@ -43,6 +72,24 @@ ; GCN-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GCN-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v4f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v29, v7 +; GFX10-NEXT: v_mov_b32_e32 v28, v6 +; GFX10-NEXT: v_mov_b32_e32 v31, v5 +; GFX10-NEXT: v_mov_b32_e32 v30, v4 +; GFX10-NEXT: v_mov_b32_e32 v25, v3 +; GFX10-NEXT: v_mov_b32_e32 v24, v2 +; GFX10-NEXT: v_mov_b32_e32 v27, v1 +; GFX10-NEXT: v_mov_b32_e32 v26, v0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[30:31], v[12:13], v[20:21] +; GFX10-NEXT: v_fma_f64 v[6:7], v[28:29], v[14:15], v[22:23] +; GFX10-NEXT: v_fma_f64 v[2:3], v[24:25], v[10:11], v[18:19] +; GFX10-NEXT: v_fma_f64 v[0:1], v[26:27], v[8:9], v[16:17] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x double> %val } @@ -53,6 +100,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f64_fpexcept_strict_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.z = fneg double %z %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret double %val @@ -64,6 +118,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], v[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg double %x %neg.y = fneg double %y %val = call double @llvm.experimental.constrained.fma.f64(double %neg.x, double %neg.y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -76,6 +137,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_fma_f64 v[0:1], |v[0:1]|, |v[2:3]|, v[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_fma_f64 v[0:1], |v[0:1]|, |v[2:3]|, v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = call double @llvm.fabs.f64(double %x) %neg.y = call double @llvm.fabs.f64(double %y) %val = call double @llvm.experimental.constrained.fma.f64(double %neg.x, double %neg.y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -89,6 +157,18 @@ ; GCN-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9] ; GCN-NEXT: v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v13, v3 +; GFX10-NEXT: v_mov_b32_e32 v12, v2 +; GFX10-NEXT: v_mov_b32_e32 v15, v1 +; GFX10-NEXT: v_mov_b32_e32 v14, v0 +; GFX10-NEXT: v_fma_f64 v[2:3], -v[12:13], -v[6:7], v[10:11] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[14:15], -v[4:5], v[8:9] +; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg <2 x double> %x %neg.y = fneg <2 x double> %y %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %neg.x, <2 x double> %neg.y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s ; FIXME: promotion not handled without f16 insts define half @v_constained_fmul_f16_fpexcept_strict(half %x, half %y) #0 { @@ -9,6 +10,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -19,6 +27,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f16_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret half %val } @@ -29,6 +44,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f16_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret half %val } @@ -47,6 +69,13 @@ ; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } @@ -65,6 +94,13 @@ ; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x half> %val } @@ -83,6 +119,13 @@ ; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x half> %val } @@ -103,6 +146,14 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v3f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fmul.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val } @@ -133,6 +184,21 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v4f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX10-NEXT: v_mul_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v2, v5, v4 +; GFX10-NEXT: v_and_b32_e32 v3, v5, v6 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fmul.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val } @@ -143,6 +209,11 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mul_f16_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fmul_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f16_e64 v0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -165,6 +236,11 @@ ; GFX8-NEXT: v_mul_f16_e32 v1, s2, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fmul_v2f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_pk_mul_f16 v0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define float @v_constained_fmul_f32_fpexcept_strict(float %x, float %y) #0 { ; GCN-LABEL: v_constained_fmul_f32_fpexcept_strict: @@ -7,6 +8,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -17,6 +25,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f32_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val } @@ -27,6 +42,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f32_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret float %val } @@ -38,6 +60,14 @@ ; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v2f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x float> %val } @@ -49,6 +79,14 @@ ; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v2f32_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %val } @@ -60,6 +98,14 @@ ; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v2f32_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x float> %val } @@ -72,6 +118,15 @@ ; GCN-NEXT: v_mul_f32_e32 v1, v1, v4 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v3f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX10-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x float> @llvm.experimental.constrained.fmul.v3f32(<3 x float> %x, <3 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x float> %val } @@ -82,6 +137,11 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mul_f32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fmul_f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e64 v0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -92,6 +152,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f32_fpexcept_strict_fabs_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %val = call float @llvm.experimental.constrained.fmul.f32(float %fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -103,6 +170,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e64 v0, v0, |v1| ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f32_fpexcept_strict_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e64 v0, v0, |v1| +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %fabs.y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -114,6 +188,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e64 v0, -|v0|, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f32_fpexcept_strict_fneg_fabs_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %neg.fabs.x = fneg float %fabs.x %val = call float @llvm.experimental.constrained.fmul.f32(float %neg.fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define double @v_constained_fmul_f64_fpexcept_strict(double %x, double %y) #0 { ; GCN-LABEL: v_constained_fmul_f64_fpexcept_strict: @@ -7,6 +8,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret double %val } @@ -17,6 +25,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f64_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret double %val } @@ -27,6 +42,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_f64_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret double %val } @@ -38,6 +60,18 @@ ; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] ; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v2f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mul_f64 v[2:3], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x double> %val } @@ -49,6 +83,18 @@ ; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] ; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v2f64_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mul_f64 v[2:3], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %val } @@ -60,6 +106,18 @@ ; GCN-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] ; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v2f64_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mul_f64 v[2:3], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x double> %val } @@ -72,6 +130,15 @@ ; GCN-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] ; GCN-NEXT: v_mul_f64 v[4:5], v[4:5], v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fmul_v3f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[6:7] +; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX10-NEXT: v_mul_f64 v[4:5], v[4:5], v[10:11] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x double> @llvm.experimental.constrained.fmul.v3f64(<3 x double> %x, <3 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x double> %val } @@ -83,6 +150,11 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fmul_f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f64 v[0:1], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") %cast = bitcast double %val to <2 x float> ret <2 x float> %cast diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s ; FIXME: promotion not handled without f16 insts define half @v_constained_fsub_f16_fpexcept_strict(half %x, half %y) #0 { @@ -9,6 +10,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -19,6 +27,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f16_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret half %val } @@ -29,6 +44,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f16_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret half %val } @@ -50,6 +72,16 @@ ; GFX8-NEXT: v_sub_f16_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1 +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } @@ -71,6 +103,16 @@ ; GFX8-NEXT: v_sub_f16_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1 +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x half> %val } @@ -92,6 +134,16 @@ ; GFX8-NEXT: v_sub_f16_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1 +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x half> %val } @@ -115,6 +167,17 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v3f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f16_e32 v4, v0, v2 +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val } @@ -145,6 +208,21 @@ ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v4f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f16_e32 v4, v0, v2 +; GFX10-NEXT: v_sub_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v2, v5, v4 +; GFX10-NEXT: v_and_b32_e32 v3, v5, v6 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val } @@ -155,6 +233,11 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_sub_f16_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fsub_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_f16_e64 v0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -183,6 +266,16 @@ ; GFX8-NEXT: v_sub_f16_e32 v1, s2, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fsub_v2f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_f16_e64 v0, s2, s3 +; GFX10-NEXT: s_lshr_b32 s0, s3, 16 +; GFX10-NEXT: s_lshr_b32 s1, s2, 16 +; GFX10-NEXT: v_sub_f16_e64 v1, s1, s0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: ; return to shader part epilog %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define float @v_constained_fsub_f32_fpexcept_strict(float %x, float %y) #0 { ; GCN-LABEL: v_constained_fsub_f32_fpexcept_strict: @@ -7,6 +8,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -17,6 +25,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f32_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val } @@ -27,6 +42,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f32_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret float %val } @@ -38,6 +60,14 @@ ; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 ; GCN-NEXT: v_sub_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v2f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x float> %val } @@ -49,6 +79,14 @@ ; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 ; GCN-NEXT: v_sub_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v2f32_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %val } @@ -60,6 +98,14 @@ ; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 ; GCN-NEXT: v_sub_f32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v2f32_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x float> %val } @@ -72,6 +118,15 @@ ; GCN-NEXT: v_sub_f32_e32 v1, v1, v4 ; GCN-NEXT: v_sub_f32_e32 v2, v2, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v3f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v4 +; GFX10-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x float> @llvm.experimental.constrained.fsub.v3f32(<3 x float> %x, <3 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x float> %val } @@ -82,6 +137,11 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_sub_f32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fsub_f32_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_f32_e64 v0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -92,6 +152,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_f32_e64 v0, |v0|, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f32_fpexcept_strict_fabs_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e64 v0, |v0|, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %val = call float @llvm.experimental.constrained.fsub.f32(float %fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -103,6 +170,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_f32_e64 v0, v0, |v1| ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f32_fpexcept_strict_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e64 v0, v0, |v1| +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %fabs.y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -114,6 +188,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_f32_e64 v0, -|v0|, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f32_fpexcept_strict_fneg_fabs_lhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_f32_e64 v0, -|v0|, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %neg.fabs.x = fneg float %fabs.x %val = call float @llvm.experimental.constrained.fsub.f32(float %neg.fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define double @v_constained_fsub_f64_fpexcept_strict(double %x, double %y) #0 { ; GCN-LABEL: v_constained_fsub_f64_fpexcept_strict: @@ -7,6 +8,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret double %val } @@ -17,6 +25,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f64_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret double %val } @@ -27,6 +42,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_f64_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret double %val } @@ -38,6 +60,18 @@ ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; GCN-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v2f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], -v[6:7] +; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], -v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x double> %val } @@ -49,6 +83,18 @@ ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; GCN-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v2f64_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], -v[6:7] +; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], -v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %val } @@ -60,6 +106,18 @@ ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; GCN-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v2f64_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], -v[6:7] +; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], -v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x double> %val } @@ -72,6 +130,15 @@ ; GCN-NEXT: v_add_f64 v[2:3], v[2:3], -v[8:9] ; GCN-NEXT: v_add_f64 v[4:5], v[4:5], -v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constained_fsub_v3f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[6:7] +; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], -v[8:9] +; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[10:11] +; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x double> @llvm.experimental.constrained.fsub.v3f64(<3 x double> %x, <3 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x double> %val } @@ -83,6 +150,11 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1] ; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_constained_fsub_f64_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f64 v[0:1], s[2:3], -s[4:5] +; GFX10-NEXT: ; return to shader part epilog %val = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") %cast = bitcast double %val to <2 x float> ret <2 x float> %cast diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10 ; FIXME: Need to handle non-uniform case for function below (load without gep). define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { @@ -45,6 +46,24 @@ ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_sub_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -96,6 +115,21 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: s_test_sub_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1 %add = sub <2 x i16> %a, %b @@ -113,6 +147,16 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm +; +; GFX10-LABEL: s_test_sub_self_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 %add = sub <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -152,6 +196,19 @@ ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: s_test_sub_v2i16_kernarg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm %add = sub <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void @@ -192,6 +249,20 @@ ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_sub_v2i16_constant: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -237,6 +308,20 @@ ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_sub_v2i16_neg_constant: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -280,6 +365,20 @@ ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_sub_v2i16_inline_neg1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -322,6 +421,20 @@ ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 32 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -368,6 +481,20 @@ ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -419,6 +546,26 @@ ; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -477,6 +624,28 @@ ; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: v_pk_sub_i16 v2, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -533,6 +702,26 @@ ; VI-NEXT: v_bfe_i32 v1, v2, 0, 16 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -592,6 +781,29 @@ ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_usubsat_i8: @@ -25,6 +26,16 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -51,6 +62,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -74,6 +92,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result } @@ -109,6 +134,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result } @@ -150,6 +182,14 @@ ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, v1, v3 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) ret <3 x i16> %result } @@ -198,6 +238,14 @@ ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, v1, v3 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -226,6 +274,14 @@ ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result } @@ -257,6 +313,15 @@ ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result } @@ -292,6 +357,16 @@ ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v7 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result } @@ -343,6 +418,20 @@ ; GFX9-NEXT: v_sub_u32_e64 v6, v6, v14 clamp ; GFX9-NEXT: v_sub_u32_e64 v7, v7, v15 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_v8i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v8 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v9 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v10 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v11 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v12 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v13 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v14 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v15 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs) ret <8 x i32> %result } @@ -426,6 +515,28 @@ ; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp ; GFX9-NEXT: v_sub_u32_e64 v15, v15, v31 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result } @@ -461,6 +572,17 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_usubsat_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_sub_co_u32_e64 v2, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { ; GFX9-LABEL: shuffle_v4f16_23uu: @@ -8,6 +9,14 @@ ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_23uu: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -27,6 +36,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_234u: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -40,6 +61,14 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_u1u3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -54,6 +83,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_u3u1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -67,6 +105,14 @@ ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_u3uu: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -84,6 +130,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_3u6u: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -101,6 +159,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_3uu7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -121,6 +191,21 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_35u5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -141,6 +226,21 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_357u: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -155,6 +255,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_0101: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -168,6 +277,14 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_0123: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -185,6 +302,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_0145: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -202,6 +331,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_0167: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -216,6 +357,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_2301: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -230,6 +380,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_2323: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -247,6 +406,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_2345: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -264,6 +435,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_2367: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -281,6 +464,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_4501: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -298,6 +493,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_4523: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -312,6 +519,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_4545: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -325,6 +541,14 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_4567: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -342,6 +566,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_6701: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -359,6 +595,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_6723: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -373,6 +621,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_6745: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -387,6 +644,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_6767: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -406,6 +672,20 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_2356: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -425,6 +705,20 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_5623: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -445,6 +739,21 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_3456: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_sdwa v1, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -464,6 +773,21 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_5634: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[9:10], v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v0, v10, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v9, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -485,6 +809,22 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_5734: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -504,6 +844,20 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4i16_2356: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> @@ -521,6 +875,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4i16_0167: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> @@ -537,6 +903,17 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_0000: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer @@ -554,6 +931,18 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_1010: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -573,6 +962,20 @@ ; GFX9-NEXT: v_and_b32_e32 v0, v2, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_1100: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v3, v0, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -592,6 +995,20 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_6161: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -608,6 +1025,17 @@ ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_2333: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -624,6 +1052,17 @@ ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_6667: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -638,6 +1077,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8f16_0101: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> @@ -651,6 +1099,14 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8f16_0123: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> @@ -668,6 +1124,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8f16_4589: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8 +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> @@ -685,6 +1153,18 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8f16_10_11_2_3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> @@ -704,6 +1184,20 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8f16_13_14_2_3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> @@ -719,6 +1213,16 @@ ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v3f16_0122: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0 %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1 %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> @@ -735,6 +1239,17 @@ ; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v2f16_0122: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0 %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1 %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> @@ -756,6 +1271,22 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v6f16_452367: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off +; GFX10-NEXT: global_load_dword v7, v[3:4], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1 %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> @@ -779,6 +1310,25 @@ ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: fma_shuffle: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dwordx2 v[7:8], v6, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_fma_f16 v4, v7, v2, v4 op_sel_hi:[0,1,1] +; GFX10-NEXT: v_pk_fma_f16 v2, v8, v2, v5 op_sel_hi:[0,1,1] +; GFX10-NEXT: v_pk_fma_f16 v0, v7, v3, v4 op_sel:[1,0,0] +; GFX10-NEXT: v_pk_fma_f16 v1, v8, v3, v2 op_sel:[1,0,0] +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9] +; GFX10-NEXT: s_endpgm entry: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp12 = zext i32 %tmp1 to i64 @@ -824,6 +1374,21 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4f16_0456: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v1, v3, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v7, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -844,6 +1409,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: shuffle_scalar_load_v8i32_0123: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX10-NEXT: s_endpgm %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8