diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir @@ -10,7 +10,9 @@ ; CHECK-LABEL: name: test_inserts_nonpow2 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 ; CHECK-NEXT: $x0 = COPY [[COPY]](s64) ; CHECK-NEXT: RET_ReallyLR %0:_(s64) = COPY $x0 @@ -30,7 +32,9 @@ liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: test_inserts_s96 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) @@ -58,7 +62,9 @@ liveins: $x0, $x1, $x2 ; CHECK-LABEL: name: test_inserts_s65 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UV]](s8) @@ -643,7 +649,9 @@ bb.0: liveins: $w0, $w1 ; CHECK-LABEL: name: s1_s32_legal - ; CHECK: %val1:_(s32) = COPY $w0 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %val1:_(s32) = COPY $w0 ; CHECK-NEXT: %val2:_(s32) = COPY $w1 ; CHECK-NEXT: %op:_(s1) = G_TRUNC %val2(s32) ; CHECK-NEXT: %ins:_(s32) = G_INSERT %val1, %op(s1), 0 @@ -662,7 +670,9 @@ bb.0: liveins: $w0, $b0 ; CHECK-LABEL: name: s8_s32_legal - ; CHECK: %val:_(s32) = COPY $w0 + ; CHECK: liveins: $w0, $b0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %val:_(s32) = COPY $w0 ; CHECK-NEXT: %op:_(s8) = COPY $b0 ; CHECK-NEXT: %ins:_(s32) = G_INSERT %val, %op(s8), 0 ; CHECK-NEXT: $w0 = COPY %ins(s32) @@ -679,7 +689,9 @@ bb.0: liveins: $w0, $h0 ; CHECK-LABEL: name: s16_s32_legal - ; CHECK: %val:_(s32) = COPY $w0 + ; CHECK: liveins: $w0, $h0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %val:_(s32) = COPY $w0 ; CHECK-NEXT: %op:_(s16) = COPY $h0 ; CHECK-NEXT: %ins:_(s32) = G_INSERT %val, %op(s16), 0 ; CHECK-NEXT: $w0 = COPY %ins(s32) @@ -696,7 +708,9 @@ bb.0: liveins: $x0, $w0 ; CHECK-LABEL: name: s32_s64_legal - ; CHECK: %val:_(s64) = COPY $x0 + ; CHECK: liveins: $x0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %val:_(s64) = COPY $x0 ; CHECK-NEXT: %op:_(s32) = COPY $w0 ; CHECK-NEXT: %ins:_(s64) = G_INSERT %val, %op(s32), 0 ; CHECK-NEXT: $x0 = COPY %ins(s64) @@ -713,7 +727,9 @@ bb.0: liveins: $x0, $w0 ; CHECK-LABEL: name: s32_p0_legal - ; CHECK: %val:_(p0) = COPY $x0 + ; CHECK: liveins: $x0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %val:_(p0) = COPY $x0 ; CHECK-NEXT: %op:_(s32) = COPY $w0 ; CHECK-NEXT: %ins:_(p0) = G_INSERT %val, %op(s32), 0 ; CHECK-NEXT: $x0 = COPY %ins(p0) @@ -730,7 +746,9 @@ bb.0: liveins: $q0, $w0 ; CHECK-LABEL: name: s32_s128 - ; CHECK: %val:_(s128) = COPY $q0 + ; CHECK: liveins: $q0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %val:_(s128) = COPY $q0 ; CHECK-NEXT: %op:_(s32) = COPY $w0 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES %val(s128) ; 
CHECK-NEXT: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[UV]], %op(s32), 0 @@ -749,7 +767,9 @@ bb.0: liveins: $q0, $w0 ; CHECK-LABEL: name: s1_s128 - ; CHECK: %val1:_(s128) = COPY $q0 + ; CHECK: liveins: $q0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %val1:_(s128) = COPY $q0 ; CHECK-NEXT: %val2:_(s32) = COPY $w1 ; CHECK-NEXT: %op:_(s1) = G_TRUNC %val2(s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES %val1(s128) @@ -772,7 +792,9 @@ ; FIXME: Can't legalize this right now? ; CHECK-LABEL: name: s4_s32 - ; CHECK: %val1:_(s32) = COPY $w0 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %val1:_(s32) = COPY $w0 ; CHECK-NEXT: %val2:_(s32) = COPY $w1 ; CHECK-NEXT: %op:_(s4) = G_TRUNC %val2(s32) ; CHECK-NEXT: %ins:_(s32) = G_INSERT %val1, %op(s4), 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll @@ -0,0 +1,869 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +define void @add_v3i16(<3 x i16> addrspace(1)* %ptra, <3 x i16> addrspace(1)* %ptrb, <3 x i16> addrspace(1)* %ptr2) { +; GFX8-LABEL: add_v3i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v8, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v9, v[6:7] +; GFX8-NEXT: flat_load_ushort v10, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v11, v[2:3] +; GFX8-NEXT: flat_load_ushort v12, v[0:1] +; GFX8-NEXT: flat_load_ushort v6, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u16_e32 v7, v8, v11 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u16_e32 v8, v9, v12 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v6, v10, v6 +; GFX8-NEXT: flat_store_short v[4:5], v7 +; GFX8-NEXT: flat_store_short v[0:1], v8 +; GFX8-NEXT: flat_store_short v[2:3], v6 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4 +; GFX9-NEXT: global_load_ushort v9, v[2:3], off +; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:2 +; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_perm_b32 v1, v10, v9, s4 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v2, v8, v11 +; GFX9-NEXT: global_store_short v[4:5], v0, off +; GFX9-NEXT: 
global_store_short_d16_hi v[4:5], v0, off offset:2 +; GFX9-NEXT: global_store_short v[4:5], v2, off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %a = load <3 x i16>, <3 x i16> addrspace(1)* %ptra, align 4 + %b = load <3 x i16>, <3 x i16> addrspace(1)* %ptrb, align 4 + %add = add <3 x i16> %a, %b + store <3 x i16> %add, <3 x i16> addrspace(1)* %ptr2, align 4 + ret void +} + +define <3 x i16> @add_v3i16_arg(<3 x i16> %a, <3 x i16> %b) { +; GFX8-LABEL: add_v3i16_arg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v4, v0, v2 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_add_u16_e32 v1, v1, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v3i16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add <3 x i16> %a, %b + ret <3 x i16> %add +} + +define void @add_v4i16(<4 x i16> addrspace(1)* %ptra, <4 x i16> addrspace(1)* %ptrb, <4 x i16> addrspace(1)* %ptr2) { +; GFX8-LABEL: add_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v6, v0, v2 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v2, v1, v3 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v6, v8 +; GFX9-NEXT: v_pk_add_u16 v1, v7, v9 +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %a = load <4 x i16>, <4 x i16> addrspace(1)* %ptra, align 4 + %b = load <4 x i16>, <4 x i16> addrspace(1)* %ptrb, align 4 + %add = add <4 x i16> %a, %b + store <4 x i16> %add, <4 x i16> addrspace(1)* %ptr2, align 4 + ret void +} + +define <4 x i16> @add_v4i16_arg(<4 x i16> %a, <4 x i16> %b) { +; GFX8-LABEL: add_v4i16_arg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v4, v0, v2 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v2, v1, v3 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v4i16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add <4 x i16> %a, %b + ret <4 x i16> %add +} + +define void @add_v5i16(<5 x i16> addrspace(1)* %ptra, <5 x i16> addrspace(1)* %ptrb, <5 x i16> 
addrspace(1)* %ptr2) { +; GFX8-LABEL: add_v5i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v12, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v13, v[6:7] +; GFX8-NEXT: flat_load_ushort v14, v[8:9] +; GFX8-NEXT: flat_load_ushort v15, v[10:11] +; GFX8-NEXT: flat_load_ushort v16, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v17, v[2:3] +; GFX8-NEXT: flat_load_ushort v18, v[0:1] +; GFX8-NEXT: flat_load_ushort v19, v[6:7] +; GFX8-NEXT: flat_load_ushort v20, v[8:9] +; GFX8-NEXT: flat_load_ushort v10, v[10:11] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 6, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 8, v4 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u16_e32 v11, v12, v17 +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u16_e32 v12, v13, v18 +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u16_e32 v13, v14, v19 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u16_e32 v14, v15, v20 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v10, v16, v10 +; GFX8-NEXT: flat_store_short v[4:5], v11 +; GFX8-NEXT: flat_store_short v[0:1], v12 +; GFX8-NEXT: flat_store_short v[2:3], v13 +; GFX8-NEXT: flat_store_short v[6:7], v14 +; GFX8-NEXT: flat_store_short v[8:9], v10 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v5i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4 +; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:6 +; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:8 +; GFX9-NEXT: global_load_ushort v11, v[2:3], off +; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:2 +; GFX9-NEXT: global_load_ushort v13, v[2:3], off offset:4 +; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:6 +; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:8 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v1, v9, v8, s4 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v2, v12, v11, s4 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_perm_b32 v3, v14, v13, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v6, v10, v15 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 +; GFX9-NEXT: global_store_short v[4:5], v0, off +; GFX9-NEXT: 
global_store_short_d16_hi v[4:5], v0, off offset:2 +; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4 +; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:6 +; GFX9-NEXT: global_store_short v[4:5], v6, off offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %a = load <5 x i16>, <5 x i16> addrspace(1)* %ptra, align 4 + %b = load <5 x i16>, <5 x i16> addrspace(1)* %ptrb, align 4 + %add = add <5 x i16> %a, %b + store <5 x i16> %add, <5 x i16> addrspace(1)* %ptr2, align 4 + ret void +} + +define <5 x i16> @add_v5i16_arg(<5 x i16> %a, <5 x i16> %b) { +; GFX8-LABEL: add_v5i16_arg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v6, v0, v3 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v3, v1, v4 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v5i16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 +; GFX9-NEXT: v_pk_add_u16 v2, v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add <5 x i16> %a, %b + ret <5 x i16> %add +} + +define void @add_v6i16(<6 x i16> addrspace(1)* %ptra, <6 x i16> addrspace(1)* %ptrb, <6 x i16> addrspace(1)* %ptr2) { +; GFX8-LABEL: add_v6i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx3 v[6:8], v[0:1] +; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v3, v6, v0 +; GFX8-NEXT: v_add_u16_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v6, v7, v1 +; GFX8-NEXT: v_add_u16_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v7, v8, v2 +; GFX8-NEXT: v_add_u16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_store_dwordx3 v[4:5], v[0:2] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v6i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx3 v[6:8], v[0:1], off +; GFX9-NEXT: global_load_dwordx3 v[9:11], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v6, v9 +; GFX9-NEXT: v_pk_add_u16 v1, v7, v10 +; GFX9-NEXT: v_pk_add_u16 v2, v8, v11 +; GFX9-NEXT: global_store_dwordx3 v[4:5], v[0:2], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %a = load <6 x i16>, <6 x i16> addrspace(1)* %ptra, align 4 + %b = load <6 x i16>, <6 x i16> addrspace(1)* %ptrb, align 4 + %add = add <6 x i16> %a, %b + store <6 x i16> %add, <6 x i16> addrspace(1)* %ptr2, align 4 + ret void +} + +define <6 x i16> @add_v6i16_arg(<6 x i16> %a, <6 x i16> %b) { +; GFX8-LABEL: add_v6i16_arg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v6, v0, v3 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v3, v1, v4 +; GFX8-NEXT: 
v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_u16_e32 v3, v2, v5 +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v6i16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 +; GFX9-NEXT: v_pk_add_u16 v2, v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add <6 x i16> %a, %b + ret <6 x i16> %add +} + +define void @add_v7i16(<7 x i16> addrspace(1)* %ptra, <7 x i16> addrspace(1)* %ptrb, <7 x i16> addrspace(1)* %ptr2) { +; GFX8-LABEL: add_v7i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 8, v0 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 10, v0 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v16, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v17, v[6:7] +; GFX8-NEXT: flat_load_ushort v18, v[8:9] +; GFX8-NEXT: flat_load_ushort v19, v[10:11] +; GFX8-NEXT: flat_load_ushort v20, v[12:13] +; GFX8-NEXT: flat_load_ushort v21, v[14:15] +; GFX8-NEXT: flat_load_ushort v22, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 10, v2 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 12, v2 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v2, v[2:3] +; GFX8-NEXT: flat_load_ushort v3, v[0:1] +; GFX8-NEXT: flat_load_ushort v6, v[6:7] +; GFX8-NEXT: flat_load_ushort v7, v[8:9] +; GFX8-NEXT: flat_load_ushort v8, v[10:11] +; GFX8-NEXT: flat_load_ushort v9, v[12:13] +; GFX8-NEXT: flat_load_ushort v10, v[14:15] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u16_e32 v2, v16, v2 +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u16_e32 v3, v17, v3 +; GFX8-NEXT: flat_store_short v[4:5], v2 +; GFX8-NEXT: flat_store_short v[0:1], v3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u16_e32 v6, v18, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: flat_store_short v[0:1], v6 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v4 +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u16_e32 v7, v19, v7 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: flat_store_short v[0:1], v7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4 +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u16_e32 v8, v20, v8 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +;
GFX8-NEXT: flat_store_short v[0:1], v8 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v4 +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u16_e32 v9, v21, v9 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: flat_store_short v[0:1], v9 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v4 +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u16_e32 v10, v22, v10 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: flat_store_short v[0:1], v10 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v7i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:2 +; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4 +; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:6 +; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:8 +; GFX9-NEXT: global_load_ushort v11, v[0:1], off offset:10 +; GFX9-NEXT: global_load_ushort v12, v[0:1], off offset:12 +; GFX9-NEXT: global_load_ushort v13, v[2:3], off +; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:2 +; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:4 +; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:6 +; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:8 +; GFX9-NEXT: global_load_ushort v18, v[2:3], off offset:10 +; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:12 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v1, v9, v8, s4 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v2, v11, v10, s4 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v3, v14, v13, s4 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v6, v16, v15, s4 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_perm_b32 v7, v18, v17, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v8, v12, v19 +; GFX9-NEXT: v_pk_add_u16 v2, v2, v7 +; GFX9-NEXT: global_store_short v[4:5], v0, off +; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2 +; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4 +; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:6 +; GFX9-NEXT: global_store_short v[4:5], v2, off offset:8 +; GFX9-NEXT: global_store_short_d16_hi v[4:5], v2, off offset:10 +; GFX9-NEXT: global_store_short v[4:5], v8, off offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %a = load <7 x i16>, <7 x i16> addrspace(1)* %ptra, align 4 + %b = load <7 x i16>, <7 x i16> addrspace(1)* %ptrb, align 4 + %add = add <7 x i16> %a, %b + store <7 x i16> %add, <7 x i16> addrspace(1)* %ptr2, align 4 + ret void +} + +define <7 x i16> @add_v7i16_arg(<7 x i16> %a, <7 x i16> %b) { +; GFX8-LABEL: add_v7i16_arg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v8, v0, v4 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v4, v1, v5 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_add_u16_e32 v4, v2, v6 +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +;
GFX8-NEXT: v_add_u16_e32 v3, v3, v7 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v7i16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v5 +; GFX9-NEXT: v_pk_add_u16 v2, v2, v6 +; GFX9-NEXT: v_pk_add_u16 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add <7 x i16> %a, %b + ret <7 x i16> %add +} + +define void @add_v9i16(<9 x i16> addrspace(1)* %ptra, <9 x i16> addrspace(1)* %ptrb, <9 x i16> addrspace(1)* %ptr2) { +; GFX8-LABEL: add_v9i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v14, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u16_e32 v1, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v3, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v11, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v13, v14, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v10 +; GFX8-NEXT: v_or_b32_e32 v2, v11, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: flat_store_short v[6:7], v13 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v9i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off +; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off +; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16 +; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_pk_add_u16 v0, v10, v6 +; GFX9-NEXT: v_pk_add_u16 v1, v11, v7 +; GFX9-NEXT: v_pk_add_u16 v2, v12, v8 +; GFX9-NEXT: v_pk_add_u16 v3, v13, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v6, v14, v15 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: global_store_short v[4:5], v6, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %a = load <9 x i16>, <9 x i16> addrspace(1)* %ptra, align 4 + %b = load <9 x i16>, <9 x i16> addrspace(1)* %ptrb, align 4 + %add = add <9 x i16> %a, %b + store <9 x i16> %add, <9 x i16> addrspace(1)* %ptr2, align 4 + ret void +} + +define <9 x i16> @add_v9i16_arg(<9 x i16> %a, <9 x i16> %b) { +; GFX8-LABEL: add_v9i16_arg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v10, v0, v5 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v5, v1, v6 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX8-NEXT: v_add_u16_e32 v5, v2, v7 +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_add_u16_e32 v5, v3, v8 +; GFX8-NEXT: v_add_u16_sdwa v3, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_u16_e32 v4, v4, v9 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v9i16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v5 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v6 +; GFX9-NEXT: v_pk_add_u16 v2, v2, v7 +; GFX9-NEXT: v_pk_add_u16 v3, v3, v8 +; GFX9-NEXT: v_pk_add_u16 v4, v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add <9 x i16> %a, %b + ret <9 x i16> %add +} + +define void @add_v10i16(<10 x i16> addrspace(1)* %ptra, <10 x i16> addrspace(1)* %ptrb, <10 x i16> addrspace(1)* %ptr2) { +; GFX8-LABEL: add_v10i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v14, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v15, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u16_e32 v0, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v2, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v6, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v8, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v6, v14, v15 +; GFX8-NEXT: v_add_u16_sdwa v7, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: flat_store_dword v[0:1], v6 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v10i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off +; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off +; GFX9-NEXT: global_load_dword v14, v[0:1], off offset:16 +; GFX9-NEXT: global_load_dword v15, v[2:3], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_pk_add_u16 v0, v10, v6 +; GFX9-NEXT: v_pk_add_u16 v1, v11, v7 +; GFX9-NEXT: v_pk_add_u16 v2, v12, v8 +; GFX9-NEXT: v_pk_add_u16 v3, v13, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v6, v14, v15 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: global_store_dword v[4:5], v6, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: s_setpc_b64 s[30:31] + %a = load <10 x i16>, <10 x i16> addrspace(1)* %ptra, align 4 + %b = load <10 x i16>, <10 x i16> addrspace(1)* %ptrb, align 4 + %add = add <10 x i16> %a, %b + store <10 x i16> %add, <10 x i16> addrspace(1)* %ptr2, align 4 + ret void +} + +define void @add_v11i16(<11 x i16> addrspace(1)* %ptra, <11 x i16> addrspace(1)* %ptrb, <11 x i16> addrspace(1)* %ptr2) { +; GFX8-LABEL: add_v11i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v2 +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v14, v[14:15] +; GFX8-NEXT: flat_load_ushort v15, v[16:17] +; GFX8-NEXT: flat_load_ushort v16, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u16_e32 v17, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v0 +; GFX8-NEXT: v_add_u16_e32 v18, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; GFX8-NEXT: flat_load_ushort v2, v[2:3] +; GFX8-NEXT: flat_load_ushort v3, v[6:7] +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v21, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u16_e32 v19, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v4 +; GFX8-NEXT: v_add_u16_e32 v20, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v17, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v18, v11 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 20, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u16_e32 v14, v2, v14 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u16_e32 v15, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v2, v19, v12 +; GFX8-NEXT: v_or_b32_e32 v3, v20, v13 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v16, v21, v16 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: flat_store_short v[6:7], v14 +; GFX8-NEXT: flat_store_short v[8:9], v15 +; GFX8-NEXT: flat_store_short v[10:11], v16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v11i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off +; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16 +; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:18 +; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:16 +; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:18 +; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off +; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:20 +; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:20 +; GFX9-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v14, v15, v14, s4 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v15, v17, v16, s4 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_pk_add_u16 v0, v6, v10 +; GFX9-NEXT: v_pk_add_u16 v1, v7, v11 +; GFX9-NEXT: v_pk_add_u16 v2, v8, v12 +; GFX9-NEXT: v_pk_add_u16 v3, v9, v13 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v6, v18, v19 +; GFX9-NEXT: v_pk_add_u16 v7, v14, v15 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: global_store_short v[4:5], v7, off offset:16 +; GFX9-NEXT: global_store_short_d16_hi v[4:5], v7, off offset:18 +; GFX9-NEXT: global_store_short v[4:5], v6, off offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %a = load <11 x i16>, <11 x i16> addrspace(1)* %ptra, align 4 + %b = load <11 x i16>, <11 x i16> addrspace(1)* %ptrb, align 4 + %add = add <11 x i16> %a, %b + store <11 x i16> %add, <11 x i16> addrspace(1)* %ptr2, align 4 + ret void +} + +define <11 x i16> @add_v11i16_arg(<11 x i16> %a, <11 x i16> %b) { +; GFX8-LABEL: add_v11i16_arg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v12, v0, v6 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v6, v1, v7 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX8-NEXT: v_add_u16_e32 v6, v2, v8 +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: v_add_u16_e32 v6, v3, v9 +; GFX8-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: v_add_u16_e32 v6, v4, v10 +; GFX8-NEXT: v_add_u16_sdwa v4, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX8-NEXT: v_add_u16_e32 v5, v5, v11 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v11i16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v6 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v7 +; GFX9-NEXT: v_pk_add_u16 v2, v2, v8 +; GFX9-NEXT: v_pk_add_u16 v3, v3, v9 +; GFX9-NEXT: v_pk_add_u16 v4, v4, v10 +; GFX9-NEXT: v_pk_add_u16 v5, v5, v11 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add <11 x i16> %a, %b + ret <11 x i16> %add +} + +define void @add_v12i16(<12 x i16> addrspace(1)* %ptra, <12 x i16> addrspace(1)* %ptrb, <12 x i16> addrspace(1)* %ptr2) { +; GFX8-LABEL: add_v12i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u16_e32 v2, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v10, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: flat_load_dwordx2 
v[6:7], v[0:1] +; GFX8-NEXT: v_add_u16_e32 v16, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v10, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v16, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u16_e32 v8, v6, v14 +; GFX8-NEXT: v_add_u16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v9, v7, v15 +; GFX8-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v9, v7 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_v12i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off +; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[14:15], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx2 v[16:17], v[2:3], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_pk_add_u16 v0, v10, v6 +; GFX9-NEXT: v_pk_add_u16 v1, v11, v7 +; GFX9-NEXT: v_pk_add_u16 v2, v12, v8 +; GFX9-NEXT: v_pk_add_u16 v3, v13, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v6, v14, v16 +; GFX9-NEXT: v_pk_add_u16 v7, v15, v17 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[6:7], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %a = load <12 x i16>, <12 x i16> addrspace(1)* %ptra, align 4 + %b = load <12 x i16>, <12 x i16> addrspace(1)* %ptrb, align 4 + %add = add <12 x i16> %a, %b + store <12 x i16> %add, <12 x i16> addrspace(1)* %ptr2, align 4 + ret void +} + +define <12 x i16> @add_v12i16_arg(<12 x i16> %a, <12 x i16> %b) { +; GFX8-LABEL: add_v12i16_arg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v12, v0, v6 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v6, v1, v7 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX8-NEXT: v_add_u16_e32 v6, v2, v8 +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: v_add_u16_e32 v6, v3, v9 +; GFX8-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: v_add_u16_e32 v6, v4, v10 +; GFX8-NEXT: v_add_u16_sdwa v4, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX8-NEXT: v_add_u16_e32 v6, v5, v11 +; GFX8-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: add_v12i16_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v6 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v7 +; GFX9-NEXT: v_pk_add_u16 v2, v2, v8 +; GFX9-NEXT: v_pk_add_u16 v3, v3, v9 +; GFX9-NEXT: v_pk_add_u16 v4, v4, v10 +; GFX9-NEXT: v_pk_add_u16 v5, v5, v11 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add <12 x i16> %a, %b + ret <12 x i16> %add +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-build-vector.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s # The G_ZEXT and G_SHL will be scalarized, introducing a # G_UNMERGE_VALUES of G_BUILD_VECTOR. The artifact combiner should @@ -30,3 +30,236 @@ %4:_(<2 x s64>) = G_SHL %3, %2 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %4 ... + +--- +name: copy_scalar +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; GFX9-LABEL: name: copy_scalar + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32) + ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[MV]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(s64) + %3:_(s64) = G_MERGE_VALUES %1, %2 + $vgpr2_vgpr3= COPY %3 +... + +--- +name: copy_vector_using_elements +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; GFX9-LABEL: name: copy_vector_using_elements + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32) + ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s32>) + %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>) + %3:_(<2 x s32>) = G_BUILD_VECTOR %1, %2 + $vgpr2_vgpr3= COPY %3 +... + +--- +name: copy_vector_using_subvectors +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; GFX9-LABEL: name: copy_vector_using_subvectors + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[UV1]](<2 x s16>) + ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s16>) + %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s16>), %2:_(<2 x s16>) = G_UNMERGE_VALUES %0(<4 x s16>) + %3:_(<4 x s16>) = G_CONCAT_VECTORS %1, %2 + $vgpr2_vgpr3= COPY %3 +... 
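+
+# The three tests above unmerge a value and rebuild every piece in source
+# order, e.g.
+#   %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(s64)
+#   %3:_(s64) = G_MERGE_VALUES %1, %2
+# which is equivalent to %3:_(s64) = COPY %0. The tests below reorder or
+# replace pieces, so the rebuilt value is not equivalent to the original
+# and must stay a G_BUILD_VECTOR.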
+ +--- +name: shuffle_vector_elements +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; GFX9-LABEL: name: shuffle_vector_elements + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV1]](s32), [[UV]](s32) + ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s32>) + %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>) + %3:_(<2 x s32>) = G_BUILD_VECTOR %2, %1 + $vgpr2_vgpr3= COPY %3 +... + +--- +name: insert_element +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3_vgpr4 + + ; GFX9-LABEL: name: insert_element + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3_vgpr4 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[COPY1]](s32) + ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s32>) + %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>) + %4:_(<2 x s32>) = G_BUILD_VECTOR %2, %1 + $vgpr2_vgpr3= COPY %4 +... + +--- +name: unmerge_to_sub_vectors +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5, $vgpr6_vgpr7 + + ; GFX9-LABEL: name: unmerge_to_sub_vectors + ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5, $vgpr6_vgpr7 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV2]](s32), [[UV3]](s32) + ; GFX9-NEXT: $vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; GFX9-NEXT: $vgpr6_vgpr7 = COPY [[BUILD_VECTOR1]](<2 x s32>) + %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32), %2:_(s32), %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %0(<4 x s32>) + %5:_(<2 x s32>) = G_BUILD_VECTOR %1, %2 + %6:_(<2 x s32>) = G_BUILD_VECTOR %3, %4 + $vgpr4_vgpr5= COPY %5 + $vgpr6_vgpr7= COPY %6 +... 
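+
+# unmerge_to_sub_vectors above rebuilds the four scalars as two
+# G_BUILD_VECTORs that each take a contiguous, in-order pair, so the pair
+# of results is equivalent to unmerging the <4 x s32> directly into halves:
+#   %5:_(<2 x s32>), %6:_(<2 x s32>) = G_UNMERGE_VALUES %0(<4 x s32>)
+# cant_unmerge_to_sub_vectors below mixes elements across the two results
+# (%1,%4 and %2,%3), so no such subvector unmerge exists.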
+ +--- +name: cant_unmerge_to_sub_vectors +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5, $vgpr6_vgpr7 + + ; GFX9-LABEL: name: cant_unmerge_to_sub_vectors + ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5, $vgpr6_vgpr7 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UV1]](s32), [[UV2]](s32) + ; GFX9-NEXT: $vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; GFX9-NEXT: $vgpr6_vgpr7 = COPY [[BUILD_VECTOR1]](<2 x s32>) + %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32), %2:_(s32), %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %0(<4 x s32>) + %5:_(<2 x s32>) = G_BUILD_VECTOR %1, %4 + %6:_(<2 x s32>) = G_BUILD_VECTOR %2, %3 + $vgpr4_vgpr5= COPY %5 + $vgpr6_vgpr7= COPY %6 +... + +--- +name: concat +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + + ; GFX9-LABEL: name: concat + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32) + ; GFX9-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s32>) + %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 + %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>) + %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<2 x s32>) + %6:_(<4 x s32>) = G_BUILD_VECTOR %2, %3, %4, %5 + $vgpr4_vgpr5_vgpr6_vgpr7= COPY %6 +... + +--- +name: concat_same_vector +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + + ; GFX9-LABEL: name: concat_same_vector + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV]](s32), [[UV1]](s32) + ; GFX9-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>) + %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>) + %3:_(<4 x s32>) = G_BUILD_VECTOR %1, %2, %1, %2 + $vgpr2_vgpr3_vgpr4_vgpr5= COPY %3 +... 
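+
+# In concat and concat_same_vector the G_BUILD_VECTOR consumes complete
+# unmerge result sets in source order, which is the same operation as
+#   %6:_(<4 x s32>) = G_CONCAT_VECTORS %0(<2 x s32>), %1(<2 x s32>)
+# (or %0, %0 for the repeated case). The remaining tests break that
+# property: shuffle_not_concat interleaves elements from the two sources,
+# and not_a_concat appends a loose scalar operand.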
+ +--- +name: shuffle_not_concat +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + + ; GFX9-LABEL: name: shuffle_not_concat + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV2]](s32), [[UV]](s32), [[UV1]](s32), [[UV3]](s32) + ; GFX9-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s32>) + %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 + %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>) + %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<2 x s32>) + %6:_(<4 x s32>) = G_BUILD_VECTOR %4, %2, %3, %5 + $vgpr4_vgpr5_vgpr6_vgpr7= COPY %6 +... + +--- +name: not_a_concat +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 + + ; GFX9-LABEL: name: not_a_concat + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[COPY2]](s32) + ; GFX9-NEXT: $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = COPY [[BUILD_VECTOR]](<5 x s32>) + %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 + %2:_(s32) = COPY $vgpr4 + %3:_(s32), %4:_(s32) = G_UNMERGE_VALUES %0(<2 x s32>) + %5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %1(<2 x s32>) + %7:_(<5 x s32>) = G_BUILD_VECTOR %3, %4, %5, %6, %2 + $vgpr5_vgpr6_vgpr7_vgpr8_vgpr9= COPY %7 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-unmerge-values.mir @@ -1548,3 +1548,47 @@ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 , implicit $vgpr4, implicit $vgpr5 ... 
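+
+# The two tests below cover G_UNMERGE_VALUES fed by a G_BITCAST between
+# scalar and vector types of the same total size. Because the s32 pieces
+# line up on both sides of the cast, a combine could in principle take the
+# pieces directly from the G_MERGE_VALUES/G_BUILD_VECTOR operands; the
+# checked output currently keeps the merge -> bitcast -> unmerge chain.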
+ +--- +name: test_unmerge_values_look_through_scalar_to_vector_bitcast +body: | + bb.0: + + ; CHECK-LABEL: name: test_unmerge_values_look_through_scalar_to_vector_bitcast + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[MV]](s64) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s64) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(<2 x s32>) = G_BITCAST %2:_(s64) + %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3:_(<2 x s32>) + $vgpr0 = COPY %4 + $vgpr1 = COPY %5 +... + +--- +name: test_unmerge_values_look_through_vector_to_scalar_bitcast +body: | + bb.0: + + ; CHECK-LABEL: name: test_unmerge_values_look_through_vector_to_scalar_bitcast + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<2 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32) + %3:_(s64) = G_BITCAST %2:_(<2 x s32>) + %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %3:_(s64) + $vgpr0 = COPY %4 + $vgpr1 = COPY %5 +...