Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -0,0 +1,1149 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+define void @addv3i16(<3 x i16> addrspace(1)* %ptra, <3 x i16> addrspace(1)* %ptrb, <3 x i16> addrspace(1)* %ptr2) {
+; GFX8-LABEL: addv3i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v8, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v9, v[6:7]
+; GFX8-NEXT: flat_load_ushort v10, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_ushort v11, v[2:3]
+; GFX8-NEXT: flat_load_ushort v12, v[0:1]
+; GFX8-NEXT: flat_load_ushort v6, v[6:7]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u16_e32 v7, v8, v11
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_add_u16_e32 v8, v9, v12
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v6, v10, v6
+; GFX8-NEXT: flat_store_short v[4:5], v7
+; GFX8-NEXT: flat_store_short v[0:1], v8
+; GFX8-NEXT: flat_store_short v[2:3], v6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v6, v[0:1], off offset:2
+; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
+; GFX9-NEXT: global_load_ushort v8, v[2:3], off offset:2
+; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:4
+; GFX9-NEXT: global_load_ushort v10, v[0:1], off
+; GFX9-NEXT: global_load_ushort v11, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_and_or_b32 v2, v7, v0, s4
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_and_or_b32 v6, v9, v0, s4
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_or_b32 v1, v10, v0, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_or_b32 v0, v11, v0, v3
+; GFX9-NEXT: v_pk_add_u16 v0, v1, v0
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v6
+; GFX9-NEXT: global_store_short v[4:5], v0, off
+; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
+; GFX9-NEXT: global_store_short v[4:5], v2, off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %a = load <3 x i16>, <3 x i16> addrspace(1)* %ptra, align 4
+  %b = load <3 x i16>, <3 x i16> addrspace(1)* %ptrb, align 4
+  %add = add <3 x i16> %a, %b
+  store <3 x i16> %add, <3 x i16> addrspace(1)* %ptr2, align 4
+  ret void
+}
+
+define <3 x i16> @addv3i16arg(<3 x i16> %a, <3 x i16> %b) {
+; GFX8-LABEL: addv3i16arg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v4, v0, v2
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_add_u16_e32 v1, v1, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv3i16arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_and_or_b32 v0, v0, v6, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT: v_and_or_b32 v2, v2, v6, v4
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
+; GFX9-NEXT: v_and_or_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_and_or_b32 v3, v3, v6, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_and_or_b32 v0, v0, v6, v2
+; GFX9-NEXT: v_and_or_b32 v1, v1, v6, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %add = add <3 x i16> %a, %b
+  ret <3 x i16> %add
+}
+
+define void @addv4i16(<4 x i16> addrspace(1)* %ptra, <4 x i16> addrspace(1)* %ptrb, <4 x i16> addrspace(1)* %ptr2) {
+; GFX8-LABEL: addv4i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v6, v0, v2
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v2, v1, v3
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv4i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v0, v6, v8
+; GFX9-NEXT: v_pk_add_u16 v1, v7, v9
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %ptra, align 4
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %ptrb, align 4
+  %add = add <4 x i16> %a, %b
+  store <4 x i16> %add, <4 x i16> addrspace(1)* %ptr2, align 4
+  ret void
+}
+
+define <4 x i16> @addv4i16arg(<4 x i16> %a, <4 x i16> %b) {
+; GFX8-LABEL: addv4i16arg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v4, v0, v2
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v2, v1, v3
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv4i16arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %add = add <4 x i16> %a, %b
+  ret <4 x i16> %add
+}
+
+define void @addv5i16(<5 x i16> addrspace(1)* %ptra, <5 x i16> addrspace(1)* %ptrb, <5 x i16> addrspace(1)* %ptr2) {
+; GFX8-LABEL: addv5i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v12, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v13, v[6:7]
+; GFX8-NEXT: flat_load_ushort v14, v[8:9]
+; GFX8-NEXT: flat_load_ushort v15, v[10:11]
+; GFX8-NEXT: flat_load_ushort v16, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_ushort v17, v[2:3]
+; GFX8-NEXT: flat_load_ushort v18, v[0:1]
+; GFX8-NEXT: flat_load_ushort v19, v[6:7]
+; GFX8-NEXT: flat_load_ushort v20, v[8:9]
+; GFX8-NEXT: flat_load_ushort v10, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 6, v4
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 8, v4
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(4)
+; GFX8-NEXT: v_add_u16_e32 v11, v12, v17
+; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: v_add_u16_e32 v12, v13, v18
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u16_e32 v13, v14, v19
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_add_u16_e32 v14, v15, v20
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v10, v16, v10
+; GFX8-NEXT: flat_store_short v[4:5], v11
+; GFX8-NEXT: flat_store_short v[0:1], v12
+; GFX8-NEXT: flat_store_short v[2:3], v13
+; GFX8-NEXT: flat_store_short v[6:7], v14
+; GFX8-NEXT: flat_store_short v[8:9], v10
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv5i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v6, v[0:1], off offset:2
+; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:6
+; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
+; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:2
+; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:6
+; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:8
+; GFX9-NEXT: global_load_ushort v12, v[0:1], off
+; GFX9-NEXT: global_load_ushort v13, v[0:1], off offset:4
+; GFX9-NEXT: global_load_ushort v14, v[2:3], off
+; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:4
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_and_or_b32 v3, v8, v0, s4
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_and_or_b32 v8, v11, v0, s4
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_and_or_b32 v1, v12, v0, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_and_or_b32 v2, v13, v0, v2
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_or_b32 v6, v14, v0, v6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_or_b32 v0, v15, v0, v7
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v6
+; GFX9-NEXT: v_pk_add_u16 v3, v3, v8
+; GFX9-NEXT: v_pk_add_u16 v0, v2, v0
+; GFX9-NEXT: global_store_short v[4:5], v1, off
+; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:2
+; GFX9-NEXT: global_store_short v[4:5], v0, off offset:4
+; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:6
+; GFX9-NEXT: global_store_short v[4:5], v3, off offset:8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %a = load <5 x i16>, <5 x i16> addrspace(1)* %ptra, align 4
+  %b = load <5 x i16>, <5 x i16> addrspace(1)* %ptrb, align 4
+  %add = add <5 x i16> %a, %b
+  store <5 x i16> %add, <5 x i16> addrspace(1)* %ptr2, align 4
+  ret void
+}
+
+define <5 x i16> @addv5i16arg(<5 x i16> %a, <5 x i16> %b) {
+; GFX8-LABEL: addv5i16arg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v6, v0, v3
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v3, v1, v4
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_add_u16_e32 v2, v2, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv5i16arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v10, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX9-NEXT: v_and_or_b32 v0, v0, v10, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GFX9-NEXT: v_and_or_b32 v1, v1, v10, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v8
+; GFX9-NEXT: v_and_or_b32 v3, v3, v10, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX9-NEXT: v_and_or_b32 v4, v4, v10, v6
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_and_or_b32 v2, v2, v10, s4
+; GFX9-NEXT: v_and_or_b32 v5, v5, v10, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v5
+; GFX9-NEXT: v_and_or_b32 v0, v0, v10, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX9-NEXT: v_and_or_b32 v1, v1, v10, v3
+; GFX9-NEXT: v_and_or_b32 v2, v2, v10, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %add = add <5 x i16> %a, %b
+  ret <5 x i16> %add
+}
+
+define void @addv6i16(<6 x i16> addrspace(1)* %ptra, <6 x i16> addrspace(1)* %ptrb, <6 x i16> addrspace(1)* %ptr2) {
+; GFX8-LABEL: addv6i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx3 v[6:8], v[0:1]
+; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[2:3]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v3, v6, v0
+; GFX8-NEXT: v_add_u16_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v6, v7, v1
+; GFX8-NEXT: v_add_u16_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v7, v8, v2
+; GFX8-NEXT: v_add_u16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_store_dwordx3 v[4:5], v[0:2]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv6i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx3 v[6:8], v[0:1], off
+; GFX9-NEXT: global_load_dwordx3 v[9:11], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v0, v6, v9
+; GFX9-NEXT: v_pk_add_u16 v1, v7, v10
+; GFX9-NEXT: v_pk_add_u16 v2, v8, v11
+; GFX9-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %a = load <6 x i16>, <6 x i16> addrspace(1)* %ptra, align 4
+  %b = load <6 x i16>, <6 x i16> addrspace(1)* %ptrb, align 4
+  %add = add <6 x i16> %a, %b
+  store <6 x i16> %add, <6 x i16> addrspace(1)* %ptr2, align 4
+  ret void
+}
+
+define <6 x i16> @addv6i16arg(<6 x i16> %a, <6 x i16> %b) {
+; GFX8-LABEL: addv6i16arg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v6, v0, v3
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v3, v1, v4
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_add_u16_e32 v3, v2, v5
+; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
+; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv6i16arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v4
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %add = add <6 x i16> %a, %b
+  ret <6 x i16> %add
+}
+
+define void @addv7i16(<7 x i16> addrspace(1)* %ptra, <7 x i16> addrspace(1)* %ptrb, <7 x i16> addrspace(1)* %ptr2) {
+; GFX8-LABEL: addv7i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 8, v0
+; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 10, v0
+; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v16, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v17, v[6:7]
+; GFX8-NEXT: flat_load_ushort v18, v[8:9]
+; GFX8-NEXT: flat_load_ushort v19, v[10:11]
+; GFX8-NEXT: flat_load_ushort v20, v[12:13]
+; GFX8-NEXT: flat_load_ushort v21, v[14:15]
+; GFX8-NEXT: flat_load_ushort v22, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 10, v2
+; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 12, v2
+; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_ushort v2, v[2:3]
+; GFX8-NEXT: flat_load_ushort v3, v[0:1]
+; GFX8-NEXT: flat_load_ushort v6, v[6:7]
+; GFX8-NEXT: flat_load_ushort v7, v[8:9]
+; GFX8-NEXT: flat_load_ushort v8, v[10:11]
+; GFX8-NEXT: flat_load_ushort v9, v[12:13]
+; GFX8-NEXT: flat_load_ushort v10, v[14:15]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(6)
+; GFX8-NEXT: v_add_u16_e32 v2, v16, v2
+; GFX8-NEXT: s_waitcnt vmcnt(5)
+; GFX8-NEXT: v_add_u16_e32 v3, v17, v3
+; GFX8-NEXT: flat_store_short v[4:5], v2
+; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GFX8-NEXT: s_waitcnt vmcnt(6)
+; GFX8-NEXT: v_add_u16_e32 v6, v18, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v6
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v4
+; GFX8-NEXT: s_waitcnt vmcnt(6)
+; GFX8-NEXT: v_add_u16_e32 v7, v19, v7
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4
+; GFX8-NEXT: s_waitcnt vmcnt(6)
+; GFX8-NEXT: v_add_u16_e32 v8, v20, v8
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v8
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v4
+; GFX8-NEXT: s_waitcnt vmcnt(6)
+; GFX8-NEXT: v_add_u16_e32 v9, v21, v9
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v4
+; GFX8-NEXT: s_waitcnt vmcnt(6)
+; GFX8-NEXT: v_add_u16_e32 v10, v22, v10
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v10
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv7i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v6, v[0:1], off offset:2
+; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:6
+; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:10
+; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:12
+; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:2
+; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:6
+; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:10
+; GFX9-NEXT: global_load_ushort v13, v[2:3], off offset:12
+; GFX9-NEXT: global_load_ushort v14, v[0:1], off
+; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:4
+; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:8
+; GFX9-NEXT: global_load_ushort v17, v[2:3], off
+; GFX9-NEXT: global_load_ushort v18, v[2:3], off offset:4
+; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:8
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(13)
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX9-NEXT: s_waitcnt vmcnt(12)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_and_or_b32 v6, v9, v0, s4
+; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_and_or_b32 v10, v13, v0, s4
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_and_or_b32 v1, v14, v0, v1
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_and_or_b32 v2, v15, v0, v2
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_and_or_b32 v3, v16, v0, v3
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_and_or_b32 v7, v17, v0, v7
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_or_b32 v8, v18, v0, v8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_or_b32 v0, v19, v0, v9
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v7
+; GFX9-NEXT: v_pk_add_u16 v6, v6, v10
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v8
+; GFX9-NEXT: v_pk_add_u16 v0, v3, v0
+; GFX9-NEXT: global_store_short v[4:5], v1, off
+; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:2
+; GFX9-NEXT: global_store_short v[4:5], v2, off offset:4
+; GFX9-NEXT: global_store_short_d16_hi v[4:5], v2, off offset:6
+; GFX9-NEXT: global_store_short v[4:5], v0, off offset:8
+; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:10
+; GFX9-NEXT: global_store_short v[4:5], v6, off offset:12
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %a = load <7 x i16>, <7 x i16> addrspace(1)* %ptra, align 4
+  %b = load <7 x i16>, <7 x i16> addrspace(1)* %ptrb, align 4
+  %add = add <7 x i16> %a, %b
+  store <7 x i16> %add, <7 x i16> addrspace(1)* %ptr2, align 4
+  ret void
+}
+
+define <7 x i16> @addv7i16arg(<7 x i16> %a, <7 x i16> %b) {
+; GFX8-LABEL: addv7i16arg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v8, v0, v4
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v4, v1, v5
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_add_u16_e32 v4, v2, v6
+; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_add_u16_e32 v3, v3, v7
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv7i16arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v14, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT: v_and_or_b32 v0, v0, v14, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX9-NEXT: v_and_or_b32 v1, v1, v14, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v5
+; GFX9-NEXT: v_and_or_b32 v2, v2, v14, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX9-NEXT: v_and_or_b32 v4, v4, v14, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12
+; GFX9-NEXT: v_and_or_b32 v5, v5, v14, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v13
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v4
+; GFX9-NEXT: v_and_or_b32 v6, v6, v14, v8
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_and_or_b32 v3, v3, v14, s4
+; GFX9-NEXT: v_and_or_b32 v7, v7, v14, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT: v_and_or_b32 v0, v0, v14, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT: v_pk_add_u16 v3, v3, v7
+; GFX9-NEXT: v_and_or_b32 v1, v1, v14, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX9-NEXT: v_and_or_b32 v2, v2, v14, v4
+; GFX9-NEXT: v_and_or_b32 v3, v3, v14, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %add = add <7 x i16> %a, %b
+  ret <7 x i16> %add
+}
+
+define void @addv9i16(<9 x i16> addrspace(1)* %ptra, <9 x i16> addrspace(1)* %ptrb, <9 x i16> addrspace(1)* %ptr2) {
+; GFX8-LABEL: addv9i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
+; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v14, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u16_e32 v1, v6, v10
+; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v3, v7, v11
+; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v11, v8, v12
+; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v12, v9, v13
+; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v13, v14, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v10
+; GFX8-NEXT: v_or_b32_e32 v2, v11, v8
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v9
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: flat_store_short v[6:7], v13
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv9i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
+; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
+; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_or_b32 v14, v14, v3, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_or_b32 v15, v15, v3, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GFX9-NEXT: v_pk_add_u16 v14, v14, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_and_or_b32 v0, v6, v3, v0
+; GFX9-NEXT: v_and_or_b32 v1, v7, v3, v1
+; GFX9-NEXT: v_and_or_b32 v2, v8, v3, v2
+; GFX9-NEXT: v_and_or_b32 v6, v9, v3, v16
+; GFX9-NEXT: v_and_or_b32 v7, v10, v3, v17
+; GFX9-NEXT: v_and_or_b32 v8, v11, v3, v18
+; GFX9-NEXT: v_and_or_b32 v9, v12, v3, v19
+; GFX9-NEXT: v_and_or_b32 v10, v13, v3, v15
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v7
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v8
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v9
+; GFX9-NEXT: v_pk_add_u16 v6, v6, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v7
+; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v8
+; GFX9-NEXT: v_and_or_b32 v2, v2, v3, v9
+; GFX9-NEXT: v_and_or_b32 v3, v6, v3, v10
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: global_store_short v[4:5], v14, off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %a = load <9 x i16>, <9 x i16> addrspace(1)* %ptra, align 4
+  %b = load <9 x i16>, <9 x i16> addrspace(1)* %ptrb, align 4
+  %add = add <9 x i16> %a, %b
+  store <9 x i16> %add, <9 x i16> addrspace(1)* %ptr2, align 4
+  ret void
+}
+
+define <9 x i16> @addv9i16arg(<9 x i16> %a, <9 x i16> %b) {
+; GFX8-LABEL: addv9i16arg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v10, v0, v5
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v5, v1, v6
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v1, v5, v1
+; GFX8-NEXT: v_add_u16_e32 v5, v2, v7
+; GFX8-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_add_u16_e32 v5, v3, v8
+; GFX8-NEXT: v_add_u16_sdwa v3, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v10, v0
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_add_u16_e32 v4, v4, v9
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv9i16arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX9-NEXT: v_and_or_b32 v0, v0, v18, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX9-NEXT: v_and_or_b32 v1, v1, v18, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v5
+; GFX9-NEXT: v_and_or_b32 v2, v2, v18, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6
+; GFX9-NEXT: v_and_or_b32 v3, v3, v18, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX9-NEXT: v_and_or_b32 v5, v5, v18, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v8
+; GFX9-NEXT: v_and_or_b32 v6, v6, v18, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v16
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v5
+; GFX9-NEXT: v_and_or_b32 v7, v7, v18, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v17
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_and_or_b32 v8, v8, v18, v10
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: v_pk_add_u16 v3, v3, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX9-NEXT: v_and_or_b32 v0, v0, v18, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT: v_and_or_b32 v4, v4, v18, s4
+; GFX9-NEXT: v_and_or_b32 v9, v9, v18, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX9-NEXT: v_and_or_b32 v1, v1, v18, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX9-NEXT: v_pk_add_u16 v4, v4, v9
+; GFX9-NEXT: v_and_or_b32 v2, v2, v18, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GFX9-NEXT: v_and_or_b32 v3, v3, v18, v5
+; GFX9-NEXT: v_and_or_b32 v4, v4, v18, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %add = add <9 x i16> %a, %b
+  ret <9 x i16> %add
+}
+
+define void @addv10i16(<10 x i16> addrspace(1)* %ptra, <10 x i16> addrspace(1)* %ptrb, <10 x i16> addrspace(1)* %ptr2) {
+; GFX8-LABEL: addv10i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
+; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v14, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v15, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u16_e32 v0, v6, v10
+; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v2, v7, v11
+; GFX8-NEXT: v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v6, v8, v12
+; GFX8-NEXT: v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v8, v9, v13
+; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v7
+; GFX8-NEXT: v_or_b32_e32 v3, v8, v9
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v6, v14, v15
+; GFX8-NEXT: v_add_u16_sdwa v7, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_store_dword v[0:1], v6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv10i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX9-NEXT: global_load_dword v14, v[0:1], off offset:16
+; GFX9-NEXT: global_load_dword v15, v[2:3], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_pk_add_u16 v0, v10, v6
+; GFX9-NEXT: v_pk_add_u16 v1, v11, v7
+; GFX9-NEXT: v_pk_add_u16 v2, v12, v8
+; GFX9-NEXT: v_pk_add_u16 v3, v13, v9
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v6, v14, v15
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: global_store_dword v[4:5], v6, off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %a = load <10 x i16>, <10 x i16> addrspace(1)* %ptra, align 4
+  %b = load <10 x i16>, <10 x i16> addrspace(1)* %ptrb, align 4
+  %add = add <10 x i16> %a, %b
+  store <10 x i16> %add, <10 x i16> addrspace(1)* %ptr2, align 4
+  ret void
+}
+
+define void @addv11i16(<11 x i16> addrspace(1)* %ptra, <11 x i16> addrspace(1)* %ptrb, <11 x i16> addrspace(1)* %ptr2) {
+; GFX8-LABEL: addv11i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
+; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v2
+; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v2
+; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_ushort v14, v[14:15]
+; GFX8-NEXT: flat_load_ushort v15, v[16:17]
+; GFX8-NEXT: flat_load_ushort v16, v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: v_add_u16_e32 v17, v6, v10
+; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v0
+; GFX8-NEXT: v_add_u16_e32 v18, v7, v11
+; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0
+; GFX8-NEXT: flat_load_ushort v2, v[2:3]
+; GFX8-NEXT: flat_load_ushort v3, v[6:7]
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v21, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; GFX8-NEXT: v_add_u16_e32 v19, v8, v12
+; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v4
+; GFX8-NEXT: v_add_u16_e32 v20, v9, v13
+; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, v17, v10
+; GFX8-NEXT: v_or_b32_e32 v1, v18, v11
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 20, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u16_e32 v14, v2, v14
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_add_u16_e32 v15, v3, v15
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v12
+; GFX8-NEXT: v_or_b32_e32 v3, v20, v13
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v16, v21, v16
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: flat_store_short v[6:7], v14
+; GFX8-NEXT: flat_store_short v[8:9], v15
+; GFX8-NEXT: flat_store_short v[10:11], v16
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv11i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
+; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:16
+; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:18
+; GFX9-NEXT: global_load_ushort v17, v[0:1], off offset:20
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v14, 0xffff
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_and_or_b32 v0, v6, v14, v0
+; GFX9-NEXT: v_and_or_b32 v1, v7, v14, v1
+; GFX9-NEXT: v_and_or_b32 v6, v8, v14, v18
+; GFX9-NEXT: v_and_or_b32 v7, v9, v14, v19
+; GFX9-NEXT: global_load_ushort v8, v[2:3], off offset:18
+; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:20
+; GFX9-NEXT: global_load_ushort v18, v[2:3], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GFX9-NEXT: v_and_or_b32 v15, v15, v14, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_and_or_b32 v2, v10, v14, v2
+; GFX9-NEXT: v_and_or_b32 v3, v11, v14, v3
+; GFX9-NEXT: v_and_or_b32 v10, v12, v14, v19
+; GFX9-NEXT: v_and_or_b32 v11, v13, v14, v16
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
+; GFX9-NEXT: v_pk_add_u16 v2, v6, v10
+; GFX9-NEXT: v_pk_add_u16 v3, v7, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_and_or_b32 v17, v17, v14, s4
+; GFX9-NEXT: v_and_or_b32 v0, v0, v14, v6
+; GFX9-NEXT: v_and_or_b32 v1, v1, v14, v7
+; GFX9-NEXT: v_and_or_b32 v3, v3, v14, v10
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_or_b32 v9, v9, v14, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_or_b32 v8, v18, v14, v8
+; GFX9-NEXT: v_pk_add_u16 v8, v15, v8
+; GFX9-NEXT: global_store_short v[4:5], v8, off offset:16
+; GFX9-NEXT: global_store_short_d16_hi v[4:5], v8, off offset:18
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_and_or_b32 v2, v2, v14, v8
+; GFX9-NEXT: v_pk_add_u16 v9, v17, v9
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: global_store_short v[4:5], v9, off offset:20
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %a = load <11 x i16>, <11 x i16> addrspace(1)* %ptra, align 4
+  %b = load <11 x i16>, <11 x i16> addrspace(1)* %ptrb, align 4
+  %add = add <11 x i16> %a, %b
+  store <11 x i16> %add, <11 x i16> addrspace(1)* %ptr2, align 4
+  ret void
+}
+
+define <11 x i16> @addv11i16arg(<11 x i16> %a, <11 x i16> %b) {
+; GFX8-LABEL: addv11i16arg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v12, v0, v6
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v6, v1, v7
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT: v_add_u16_e32 v6, v2, v8
+; GFX8-NEXT: v_add_u16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: v_add_u16_e32 v6, v3, v9
+; GFX8-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
+; GFX8-NEXT: v_add_u16_e32 v6, v4, v10
+; GFX8-NEXT: v_add_u16_sdwa v4, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX8-NEXT: v_add_u16_e32 v5, v5, v11
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv11i16arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v6
+; GFX9-NEXT: v_and_or_b32 v3, v3, v19, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_and_or_b32 v4, v4, v19, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX9-NEXT: v_and_or_b32 v0, v0, v19, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_and_or_b32 v6, v6, v19, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v18
+; GFX9-NEXT: v_and_or_b32 v1, v1, v19, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_and_or_b32 v7, v7, v19, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v6
+; GFX9-NEXT: v_and_or_b32 v2, v2, v19, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX9-NEXT: v_and_or_b32 v8, v8, v19, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_and_or_b32 v9, v9, v19, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_and_or_b32 v10, v10, v19, v12
+; GFX9-NEXT: v_pk_add_u16 v3, v3, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX9-NEXT: v_and_or_b32 v0, v0, v19, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: v_pk_add_u16 v4, v4, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_and_or_b32 v1, v1, v19, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v8
+; GFX9-NEXT: v_and_or_b32 v5, v5, v19, s4
+; GFX9-NEXT: v_and_or_b32 v11, v11, v19, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX9-NEXT: v_and_or_b32 v2, v2, v19, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX9-NEXT: v_pk_add_u16 v5, v5, v11
+; GFX9-NEXT: v_and_or_b32 v3, v3, v19, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GFX9-NEXT: v_and_or_b32 v4, v4, v19, v6
+; GFX9-NEXT: v_and_or_b32 v5, v5, v19, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %add = add <11 x i16> %a, %b
+  ret <11 x i16> %add
+}
+
+define void @addv12i16(<12 x i16> addrspace(1)* %ptra, <12 x i16> addrspace(1)* %ptrb, <12 x i16> addrspace(1)* %ptr2) {
+; GFX8-LABEL: addv12i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
+; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[2:3]
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_add_u16_e32 v2, v6, v10
+; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v10, v7, v11
+; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: v_add_u16_e32 v16, v8, v12
+; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v12, v9, v13
+; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v10, v11
+; GFX8-NEXT: v_or_b32_e32 v2, v16, v8
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v9
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_add_u16_e32 v8, v6, v14
+; GFX8-NEXT: v_add_u16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v9, v7, v15
+; GFX8-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4
+; GFX8-NEXT: v_or_b32_e32 v6, v8, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v9, v7
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv12i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[14:15], v[0:1], off offset:16
+; GFX9-NEXT: global_load_dwordx2 v[16:17], v[2:3], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_pk_add_u16 v0, v10, v6
+; GFX9-NEXT: v_pk_add_u16 v1, v11, v7
+; GFX9-NEXT: v_pk_add_u16 v2, v12, v8
+; GFX9-NEXT: v_pk_add_u16 v3, v13, v9
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v6, v14, v16
+; GFX9-NEXT: v_pk_add_u16 v7, v15, v17
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[6:7], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %a = load <12 x i16>, <12 x i16> addrspace(1)* %ptra, align 4
+  %b = load <12 x i16>, <12 x i16> addrspace(1)* %ptrb, align 4
+  %add = add <12 x i16> %a, %b
+  store <12 x i16> %add, <12 x i16> addrspace(1)* %ptr2, align 4
+  ret void
+}
+
+define <12 x i16> @addv12i16arg(<12 x i16> %a, <12 x i16> %b) {
+; GFX8-LABEL: addv12i16arg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v12, v0, v6
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e32 v6, v1, v7
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT: v_add_u16_e32 v6, v2, v8
+; GFX8-NEXT: v_add_u16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX8-NEXT: v_add_u16_e32 v6, v3, v9
+; GFX8-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
+; GFX8-NEXT: v_add_u16_e32 v6, v4, v10
+; GFX8-NEXT: v_add_u16_sdwa v4, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX8-NEXT: v_add_u16_e32 v6, v5, v11
+; GFX8-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX8-NEXT: v_or_b32_e32 v5, v6, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: addv12i16arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v6
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v7
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v8
+; GFX9-NEXT: v_pk_add_u16 v3, v3, v9
+; GFX9-NEXT: v_pk_add_u16 v4, v4, v10
+; GFX9-NEXT: v_pk_add_u16 v5, v5, v11
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %add = add <12 x i16> %a, %b
+  ret <12 x i16> %add
+}
+
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
 
 # The G_ZEXT and G_SHL will be scalarized, introducing a
 # G_UNMERGE_VALUES of G_BUILD_VECTOR. The artifact combiner should
@@ -30,3 +30,214 @@
     %4:_(<2 x s64>) = G_SHL %3, %2
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %4
 ...
+
+---
+name: copy_scalar
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GFX9-LABEL: name: copy_scalar
+    ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32)
+    ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(s64)
+    %3:_(s64) = G_MERGE_VALUES %1, %2
+    $vgpr2_vgpr3= COPY %3
+...
+
+---
+name: copy_vector_using_elements
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GFX9-LABEL: name: copy_vector_using_elements
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32)
+    ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
+    %3:_(<2 x s32>) = G_BUILD_VECTOR %1, %2
+    $vgpr2_vgpr3= COPY %3
+...
+
+---
+name: copy_vector_using_subvectors
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GFX9-LABEL: name: copy_vector_using_subvectors
+    ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>)
+    ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s16>), %2:_(<2 x s16>) = G_UNMERGE_VALUES %0(<4 x s16>)
+    %3:_(<4 x s16>) = G_CONCAT_VECTORS %1, %2
+    $vgpr2_vgpr3= COPY %3
+...
+
+---
+name: shuffle_vector_elements
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GFX9-LABEL: name: shuffle_vector_elements
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV1]](s32), [[UV]](s32)
+    ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
+    %3:_(<2 x s32>) = G_BUILD_VECTOR %2, %1
+    $vgpr2_vgpr3= COPY %3
+...
+
+---
+name: insert_element
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3_vgpr4
+
+    ; GFX9-LABEL: name: insert_element
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[COPY1]](s32)
+    ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(s32) = COPY $vgpr2
+    %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
+    %4:_(<2 x s32>) = G_BUILD_VECTOR %2, %1
+    $vgpr2_vgpr3= COPY %4
+...
+
+---
+name: unmerge_to_sub_vectors
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5, $vgpr6_vgpr7
+
+    ; GFX9-LABEL: name: unmerge_to_sub_vectors
+    ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32)
+    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV2]](s32), [[UV3]](s32)
+    ; GFX9-NEXT: $vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ; GFX9-NEXT: $vgpr6_vgpr7 = COPY [[BUILD_VECTOR1]](<2 x s32>)
+    %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %1:_(s32), %2:_(s32), %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %0(<4 x s32>)
+    %5:_(<2 x s32>) = G_BUILD_VECTOR %1, %2
+    %6:_(<2 x s32>) = G_BUILD_VECTOR %3, %4
+    $vgpr4_vgpr5= COPY %5
+    $vgpr6_vgpr7= COPY %6
+...
+
+---
+name: cant_unmerge_to_sub_vectors
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5, $vgpr6_vgpr7
+
+    ; GFX9-LABEL: name: cant_unmerge_to_sub_vectors
+    ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV3]](s32)
+    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV1]](s32), [[UV2]](s32)
+    ; GFX9-NEXT: $vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ; GFX9-NEXT: $vgpr6_vgpr7 = COPY [[BUILD_VECTOR1]](<2 x s32>)
+    %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %1:_(s32), %2:_(s32), %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %0(<4 x s32>)
+    %5:_(<2 x s32>) = G_BUILD_VECTOR %1, %4
+    %6:_(<2 x s32>) = G_BUILD_VECTOR %2, %3
+    $vgpr4_vgpr5= COPY %5
+    $vgpr6_vgpr7= COPY %6
+...
+
+---
+name: concat
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
+
+    ; GFX9-LABEL: name: concat
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32)
+    ; GFX9-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
+    %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<2 x s32>)
+    %6:_(<4 x s32>) = G_BUILD_VECTOR %2, %3, %4, %5
+    $vgpr4_vgpr5_vgpr6_vgpr7= COPY %6
+...
+
+---
+name: concat_same_vector
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+
+    ; GFX9-LABEL: name: concat_same_vector
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV]](s32), [[UV1]](s32)
+    ; GFX9-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
+    %3:_(<4 x s32>) = G_BUILD_VECTOR %1, %2, %1, %2
+    $vgpr2_vgpr3_vgpr4_vgpr5= COPY %3
+...
+
+---
+name: shuffle_not_concat
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
+
+    ; GFX9-LABEL: name: shuffle_not_concat
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV2]](s32), [[UV]](s32), [[UV1]](s32), [[UV3]](s32)
+    ; GFX9-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
+    %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<2 x s32>)
+    %6:_(<4 x s32>) = G_BUILD_VECTOR %4, %2, %3, %5
+    $vgpr4_vgpr5_vgpr6_vgpr7= COPY %6
+...
+
+---
+name: not_a_concat
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+
+    ; GFX9-LABEL: name: not_a_concat
+    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[COPY2]](s32)
+    ; GFX9-NEXT: $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = COPY [[BUILD_VECTOR]](<5 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    %2:_(s32) = COPY $vgpr4
+    %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>)
+    %5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %1(<2 x s32>)
+    %7:_(<5 x s32>) = G_BUILD_VECTOR %3, %4, %5, %6, %2
+    $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9= COPY %7
+...
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir
@@ -1548,3 +1548,47 @@
     S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 , implicit $vgpr4, implicit $vgpr5

 ...
+
+---
+name: test_unmerge_values_look_through_scalar_to_vector_bitcast
+body: |
+  bb.0:
+
+    ; CHECK-LABEL: name: test_unmerge_values_look_through_scalar_to_vector_bitcast
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[MV]](s64)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+    ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
+    ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s64) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+    %3:_(<2 x s32>) = G_BITCAST %2:_(s64)
+    %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3:_(<2 x s32>)
+    $vgpr0 = COPY %4
+    $vgpr1 = COPY %5
+...
+
+---
+name: test_unmerge_values_look_through_vector_to_scalar_bitcast
+body: |
+  bb.0:
+
+    ; CHECK-LABEL: name: test_unmerge_values_look_through_vector_to_scalar_bitcast
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s64)
+    ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
+    ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(<2 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32)
+    %3:_(s64) = G_BITCAST %2:_(<2 x s32>)
+    %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3:_(s64)
+    $vgpr0 = COPY %4
+    $vgpr1 = COPY %5
+...