diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_0_i16: @@ -37,6 +38,19 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, s0 ; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_vec_0_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tmp = insertelement <2 x i16> undef, i16 0, i32 0 %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1 %val = bitcast <2 x i16> %vec to i32 @@ -62,6 +76,13 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: divergent_vec_0_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x i16> undef, i16 0, i32 0 %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1 %val = bitcast <2 x i16> %vec to i32 @@ -102,6 +123,19 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, s0 ; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_vec_i16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tmp = insertelement <2 x i16> undef, i16 %a, i32 0 %vec = insertelement <2 x i16> %tmp, i16 0, i32 1 %val = bitcast <2 x i16> %vec to i32 @@ -127,6 +161,13 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: divergent_vec_i16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x i16> undef, i16 %a, i32 0 %vec = insertelement <2 x i16> %tmp, i16 0, i32 1 %val = bitcast <2 x i16> %vec to i32 @@ -167,6 +208,19 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, s0 ; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_vec_f16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tmp = insertelement <2 x half> undef, half %a, i32 0 %vec = insertelement <2 x half> %tmp, half 0.0, i32 1 %val = bitcast <2 x half> %vec to float @@ -192,6 +246,13 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: divergent_vec_f16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x half> undef, half %a, i32 0 %vec = insertelement <2 x half> %tmp, half 0.0, i32 1 %val = bitcast <2 x half> %vec to float @@ -239,6 +300,19 @@ ; GFX906-NEXT: ; use s0 ; GFX906-NEXT: ;;#ASMEND ; GFX906-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_vec_i16_LL: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm %val0 = load volatile i32, ptr addrspace(4) %in0 %val1 = load volatile i32, ptr addrspace(4) %in1 %lo = trunc i32 %val0 to i16 @@ -272,6 +346,13 @@ ; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: divergent_vec_i16_LL: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x i16> undef, i16 %a, i32 0 %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1 %val = bitcast <2 x i16> %vec to i32 @@ -313,6 +394,17 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, s2 ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_vec_i16_LH: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %shift = lshr i32 %b, 16 %tr = trunc i32 %shift to i16 %tmp = insertelement <2 x i16> undef, i16 %a, i32 0 @@ -343,6 +435,13 @@ ; GFX906-NEXT: s_mov_b32 s4, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: divergent_vec_i16_LH: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %shift = lshr i32 %b, 16 %tr = trunc i32 %shift to i16 %tmp = insertelement <2 x i16> undef, i16 %a, i32 0 @@ -386,6 +485,17 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, s2 ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_vec_i16_HH: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %shift_a = lshr i32 %a, 16 %tr_a = trunc i32 %shift_a to i16 %shift_b = lshr i32 %b, 16 @@ -419,6 +529,13 @@ ; GFX906-NEXT: s_mov_b32 s4, 0x7060302 ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: divergent_vec_i16_HH: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: s_setpc_b64 s[30:31] %shift_a = lshr i32 %a, 16 %tr_a = trunc i32 %shift_a to i16 %shift_b = lshr i32 %b, 16 @@ -470,6 +587,19 @@ ; GFX906-NEXT: ; use s0 ; GFX906-NEXT: ;;#ASMEND ; GFX906-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_vec_f16_LL: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm %val0 = load volatile i32, ptr addrspace(4) %in0 %val1 = load volatile i32, ptr addrspace(4) %in1 %lo.i = trunc i32 %val0 to i16 @@ -507,6 +637,13 @@ ; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: divergent_vec_f16_LL: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x half> undef, half %a, i32 0 %vec = insertelement <2 x half> %tmp, half %b, i32 1 %val = bitcast <2 x half> %vec to float @@ -535,6 +672,14 @@ ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: build_vec_v2i16_undeflo_divergent: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u16_d16 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, ptr addrspace(3) %in %build = insertelement <2 x i16> undef, i16 %load, i32 0 @@ -579,6 +724,19 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[2:3] ; GFX906-NEXT: s_endpgm +; +; GFX11-LABEL: build_vec_v2i16_undeflo_uniform: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: ds_load_u16_d16 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %load = load i16, ptr addrspace(3) %in %build = insertelement <2 x i16> undef, i16 %load, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) { ; SI-LABEL: vec_8xi16_extract_4xi16: @@ -114,6 +115,38 @@ ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: vec_8xi16_extract_4xi16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_cbranch_scc0 .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %F +; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_cbranch_execz .LBB0_3 +; GFX11-NEXT: s_branch .LBB0_4 +; GFX11-NEXT: .LBB0_2: +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX11-NEXT: .LBB0_3: ; %T +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: .LBB0_4: ; %exit +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1 +; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F T: @@ -246,6 +279,38 @@ ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: vec_8xi16_extract_4xi16_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %F +; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_cbranch_execz .LBB1_3 +; GFX11-NEXT: s_branch .LBB1_4 +; GFX11-NEXT: .LBB1_2: +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX11-NEXT: .LBB1_3: ; %T +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: .LBB1_4: ; %exit +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1 +; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F T: @@ -381,6 +446,39 @@ ; GFX9-NEXT: v_pack_b32_f16 v1, v0, v5 ; GFX9-NEXT: v_pack_b32_f16 v0, v4, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: vec_8xf16_extract_4xf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_cbranch_scc0 .LBB2_2 +; GFX11-NEXT: ; %bb.1: ; %F +; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_cbranch_execz .LBB2_3 +; GFX11-NEXT: s_branch .LBB2_4 +; GFX11-NEXT: .LBB2_2: +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX11-NEXT: .LBB2_3: ; %T +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: .LBB2_4: ; %exit +; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0 +; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo +; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1 +; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F T: @@ -550,6 +648,42 @@ ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: vec_16xi16_extract_4xi16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %F +; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_cbranch_execz .LBB3_3 +; GFX11-NEXT: s_branch .LBB3_4 +; GFX11-NEXT: .LBB3_2: +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-NEXT: .LBB3_3: ; %T +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: .LBB3_4: ; %exit +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1 +; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F T: @@ -721,6 +855,42 @@ ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: vec_16xi16_extract_4xi16_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_cbranch_scc0 .LBB4_2 +; GFX11-NEXT: ; %bb.1: ; %F +; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_cbranch_execz .LBB4_3 +; GFX11-NEXT: s_branch .LBB4_4 +; GFX11-NEXT: .LBB4_2: +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-NEXT: .LBB4_3: ; %T +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: .LBB4_4: ; %exit +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1 +; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F T: @@ -895,6 +1065,43 @@ ; GFX9-NEXT: v_pack_b32_f16 v1, v0, v4 ; GFX9-NEXT: v_pack_b32_f16 v0, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: vec_16xf16_extract_4xf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_2 +; GFX11-NEXT: ; %bb.1: ; %F +; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_cbranch_execz .LBB5_3 +; GFX11-NEXT: s_branch .LBB5_4 +; GFX11-NEXT: .LBB5_2: +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-NEXT: .LBB5_3: ; %T +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: .LBB5_4: ; %exit +; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0 +; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo +; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1 +; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F T: @@ -945,6 +1152,16 @@ ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: large_vector: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshl_add_u32 v2, v1, 5, v0 +; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX11-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %idx = shl i32 %idxp, 4 %i.0 = or i32 %idx, 0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -1,14 +1,54 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s -; GCN-LABEL: {{^}}extract_vector_elt_v2f16: -; GCN: s_load_dword [[VEC:s[0-9]+]] -; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 -; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]] -; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] -; GCN-DAG: buffer_store_short [[VELT0]] -; GCN-DAG: buffer_store_short [[VELT1]] define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { +; SI-LABEL: extract_vector_elt_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s4, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20 +; SI-NEXT: s_endpgm +; +; VI-LABEL: extract_vector_elt_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extract_vector_elt_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:20 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr %p0 = extractelement <2 x half> %vec, i32 0 %p1 = extractelement <2 x half> %vec, i32 1 @@ -18,34 +58,124 @@ ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v2f16_dynamic_sgpr: -; GCN: s_load_dword [[IDX:s[0-9]+]] -; GCN: s_load_dword [[VEC:s[0-9]+]] -; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 4 -; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]] -; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] -; GCN: buffer_store_short [[VELT1]] -; GCN: ScratchSize: 0 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { +; SI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s1, s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_lshl_b32 s0, s0, 4 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s0, s1, s0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s6, s[6:7], 0x0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_lshl_b32 s4, s8, 4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s4, s6, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s0, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr %elt = extractelement <2 x half> %vec, i32 %idx store half %elt, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v2f16_dynamic_vgpr: -; GCN-DAG: s_load_dword [[VEC:s[0-9]+]] -; GCN-DAG: {{flat|buffer}}_load_dword [[IDX:v[0-9]+]] -; GCN: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 4, [[IDX]] - -; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]] -; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]] - - -; SI: buffer_store_short [[ELT]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] -; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 { +; SI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_mov_b64 s[0:1], s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshr_b32_e32 v0, s6, v0 +; SI-NEXT: buffer_store_short v0, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v2, v[1:2] +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s1, s[2:3], 0x0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e64 v2, v2, s1 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext @@ -57,12 +187,50 @@ ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v3f16: -; GCN: s_load_dwordx4 - -; GCN: buffer_store_short -; GCN: buffer_store_short define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 { +; SI-LABEL: extract_vector_elt_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 +; SI-NEXT: s_endpgm +; +; VI-LABEL: extract_vector_elt_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extract_vector_elt_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 0 %p1 = extractelement <3 x half> %foo, i32 2 %out1 = getelementptr half, ptr addrspace(1) %out, i32 1 @@ -72,35 +240,106 @@ } ; FIXME: Why sometimes vector shift? -; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16: -; SI: s_load_dword s -; SI: s_load_dwordx4 s - -; GFX89: s_load_dwordx4 s -; GFX89: s_load_dword s - - -; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 -; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} - -; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 { +; SI-LABEL: dynamic_extract_vector_elt_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s4, 4 +; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: dynamic_extract_vector_elt_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_lshl_b32 s4, s8, 4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s4, s4, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 %idx %out1 = getelementptr half, ptr addrspace(1) %out, i32 1 store half %p0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_extractelement_v4f16_2: -; SI: buffer_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: buffer_store_short [[LOAD]] - -; VI: flat_load_dword v -; VI: flat_store_short - -; GFX9: global_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off offset:4 -; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]] define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: v_extractelement_v4f16_2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_extractelement_v4f16_2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v2, v[1:2] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_extractelement_v4f16_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -111,17 +350,69 @@ ret void } -; GCN-LABEL: {{^}}v_insertelement_v4f16_dynamic_vgpr: -; GCN-DAG: {{flat|global|buffer}}_load_dword [[IDX:v[0-9]+]], -; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] - -; GFX89: v_lshrrev_b64 v[[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]], [[SCALED_IDX]], v[[[LO]]:[[HI]]] -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[SHIFT_LO]] - -; SI: v_lshr_b64 v[[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]], v[[[LO]]:[[HI]]], [[SCALED_IDX]] -; SI: buffer_store_short v[[SHIFT_LO]] define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: v_insertelement_v4f16_dynamic_vgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v5, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], v0 +; SI-NEXT: buffer_store_short v3, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b64 v[0:1], v0, v[1:2] +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; VI-NEXT: flat_store_short v[1:2], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -133,13 +424,58 @@ ret void } -; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_01: -; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]], -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0 -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 { +; SI-LABEL: reduce_load_vector_v8f16_extract_01: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s1, s0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: reduce_load_vector_v8f16_extract_01: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: reduce_load_vector_v8f16_extract_01: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load <16 x half>, ptr addrspace(4) %ptr %elt0 = extractelement <16 x half> %load, i32 0 %elt1 = extractelement <16 x half> %load, i32 1 @@ -148,13 +484,58 @@ ret void } -; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_23: -; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]], -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}} -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 { +; SI-LABEL: reduce_load_vector_v8f16_extract_23: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s0, s[0:1], 0x1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s1, s0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: reduce_load_vector_v8f16_extract_23: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s0, s[0:1], 0x4 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: reduce_load_vector_v8f16_extract_23: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load <16 x half>, ptr addrspace(4) %ptr %elt2 = extractelement <16 x half> %load, i32 2 %elt3 = extractelement <16 x half> %load, i32 3 @@ -163,9 +544,143 @@ ret void } -; GCN-LABEL: {{^}}v_extractelement_v8f16_dynamic_sgpr: -; GCN-COUNT-7: v_cndmask_b32_e32 define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { +; SI-LABEL: v_extractelement_v8f16_dynamic_sgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; SI-NEXT: v_mov_b32_e32 v5, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 +; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 5 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 6 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 7 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_extractelement_v8f16_dynamic_sgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dwordx4 v[1:4], v[1:2] +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0 +; VI-NEXT: s_cmp_eq_u32 s0, 1 +; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 3 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 5 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 6 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 7 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; VI-NEXT: flat_store_short v[5:6], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[1:4], v1, s[6:7] +; GFX11-NEXT: s_cmp_eq_u32 s0, 1 +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: s_cmp_eq_u32 s0, 3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 4 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 5 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: s_cmp_eq_u32 s0, 7 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -176,9 +691,248 @@ ret void } -; GCN-LABEL: {{^}}v_extractelement_v16f16_dynamic_sgpr: -; GCN-COUNT-15: v_cndmask_b32_e32 define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { +; SI-LABEL: v_extractelement_v16f16_dynamic_sgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[0:3], 0 addr64 +; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v0 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[0:3], 0 addr64 offset:16 +; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 5 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 6 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 7 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 8 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 9 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 10 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 11 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 12 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 13 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 14 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 15 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, v[9:10], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v1 +; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dwordx4 v[1:4], v[5:6] +; VI-NEXT: v_add_u32_e32 v5, vcc, 16, v5 +; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; VI-NEXT: flat_load_dwordx4 v[5:8], v[5:6] +; VI-NEXT: v_mov_b32_e32 v10, s5 +; VI-NEXT: v_add_u32_e32 v9, vcc, s4, v0 +; VI-NEXT: s_cmp_eq_u32 s0, 1 +; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 3 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 5 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 6 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 7 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 8 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 9 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 10 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 11 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 12 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 13 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 14 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 15 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; VI-NEXT: flat_store_short v[9:10], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7] offset:16 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 1 +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 2 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-NEXT: s_cmp_eq_u32 s0, 3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 4 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 5 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: s_cmp_eq_u32 s0, 7 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: s_cmp_eq_u32 s0, 9 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11-NEXT: s_cmp_eq_u32 s0, 11 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX11-NEXT: s_cmp_eq_u32 s0, 13 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: s_cmp_eq_u32 s0, 15 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -193,3 +947,5 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -1,14 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s -; GCN-LABEL: {{^}}test_fmin3_olt_0_f32: -; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: buffer_load_dword [[REGB:v[0-9]+]] -; GCN: buffer_load_dword [[REGA:v[0-9]+]] -; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_0_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_min3_f32 v0, v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_0_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_min3_f32 v0, v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_0_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -19,13 +135,127 @@ } ; Commute operand of second fmin -; GCN-LABEL: {{^}}test_fmin3_olt_1_f32: -; GCN: buffer_load_dword [[REGB:v[0-9]+]] -; GCN: buffer_load_dword [[REGA:v[0-9]+]] -; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_1_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_min3_f32 v0, v2, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_1_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_min3_f32 v0, v2, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_1_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_min3_f32 v0, v2, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -35,20 +265,135 @@ ret void } -; GCN-LABEL: {{^}}test_fmin3_olt_0_f16: -; GCN: buffer_load_ushort [[REGC:v[0-9]+]] -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] -; GCN: buffer_load_ushort [[REGA:v[0-9]+]] - -; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]] - -; VI: v_min_f16_e32 -; VI: v_min_f16_e32 [[RESULT:v[0-9]+]], - -; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_0_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min3_f32 v0, v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_0_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v1, v2, v2 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_0_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2 +; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -59,23 +404,135 @@ } ; Commute operand of second fmin -; GCN-LABEL: {{^}}test_fmin3_olt_1_f16: -; GCN: buffer_load_ushort [[REGA:v[0-9]+]] -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] -; GCN: buffer_load_ushort [[REGC:v[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] -; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]] -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] - -; VI: v_min_f16_e32 -; VI: v_min_f16_e32 [[RESULT:v[0-9]+]], - -; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] -; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_1_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min3_f32 v0, v2, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_1_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v1, v2, v2 +; VI-NEXT: v_min_f16_e32 v0, v1, v0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_1_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_min3_f16 v0, v2, v0, v1 +; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_1_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -87,30 +544,61 @@ ; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of min3 ; since there are no pack instructions for fmin3. -; GCN-LABEL: {{^}}no_fmin3_v2f16: - -; SI: v_cvt_f16_f32_e32 -; SI: v_min_f32_e32 -; SI-NEXT: v_min_f32_e32 -; SI-NEXT: v_min3_f32 -; SI-NEXT: v_min3_f32 - -; VI: s_waitcnt -; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_min_f16_e32 v0, v2, v0 -; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_min_f16_e32 v0, v0, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_setpc_b64 - -; GFX9: s_waitcnt -; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX9-NEXT: v_pk_min_f16 v0, v2, v0 -; GFX9-NEXT: v_pk_min_f16 v0, v0, v3 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 { +; SI-LABEL: no_fmin3_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: v_min_f32_e32 v0, v0, v2 +; SI-NEXT: v_min3_f32 v0, v4, v0, v6 +; SI-NEXT: v_min3_f32 v1, v5, v1, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: no_fmin3_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_min_f16_e32 v0, v2, v0 +; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: no_fmin3_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: v_pk_min_f16 v0, v2, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: no_fmin3_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v0, v2, v0 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min) @@ -118,9 +606,142 @@ ret <2 x half> %res } -; GCN-LABEL: {{^}}test_fmin3_olt_0_f64: -; GCN-NOT: v_min3 define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_0_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; SI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; SI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_0_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_0_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_0_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 @@ -131,9 +752,142 @@ } ; Commute operand of second fmin -; GCN-LABEL: {{^}}test_fmin3_olt_1_f64: -; GCN-NOT: v_min3 define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_1_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; SI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; SI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; SI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_1_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; VI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_1_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_1_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 @@ -152,3 +906,5 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone speculatable } attributes #2 = { nounwind "no-nans-fp-math"="true" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -8,6 +8,9 @@ ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s ; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN %s + define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 { ; GFX9-SAFE-LABEL: test_fmin_legacy_ule_f16: @@ -55,6 +58,21 @@ ; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-LABEL: test_fmin_legacy_ule_f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NNAN-LABEL: test_fmin_legacy_ule_f16: +; GFX11-NNAN: ; %bb.0: +; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b %val = select i1 %cmp, half %a, half %b ret half %val @@ -130,6 +148,27 @@ ; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v2 ; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v3 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v2f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v2f16: +; GFX11-NNAN: ; %bb.0: +; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b ret <2 x half> %val @@ -221,6 +260,30 @@ ; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v4 ; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v5 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v3f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v3f16: +; GFX11-NNAN: ; %bb.0: +; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <3 x half> %a, %b %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b ret <3 x half> %val @@ -335,6 +398,36 @@ ; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v6 ; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v7 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v4f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16: +; GFX11-NNAN: ; %bb.0: +; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b ret <4 x half> %val @@ -527,6 +620,52 @@ ; SI-NNAN-NEXT: v_min_f32_e32 v6, v6, v14 ; SI-NNAN-NEXT: v_min_f32_e32 v7, v7, v15 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v8f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16: +; GFX11-NNAN: ; %bb.0: +; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX11-NNAN-NEXT: v_pk_min_f16 v2, v2, v6 +; GFX11-NNAN-NEXT: v_pk_min_f16 v3, v3, v7 +; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <8 x half> %a, %b %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b ret <8 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -1,18 +1,83 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s -; GCN-LABEL: {{^}}fmul_f16 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_f16( +; SI-LABEL: fmul_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fmul_f16: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_mov_b32 s14, s2 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s12, s6 +; GFX89-NEXT: s_mov_b32 s13, s7 +; GFX89-NEXT: s_mov_b32 s15, s3 +; GFX89-NEXT: s_mov_b32 s10, s2 +; GFX89-NEXT: s_mov_b32 s11, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s0, s4 +; GFX89-NEXT: s_mov_b32 s1, s5 +; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -24,15 +89,63 @@ ret void } -; GCN-LABEL: {{^}}fmul_f16_imm_a -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_f16_imm_a( +; SI-LABEL: fmul_f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fmul_f16_imm_a: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: @@ -42,16 +155,63 @@ ret void } -; GCN-LABEL: {{^}}fmul_f16_imm_b -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] - -; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_f16_imm_b( +; SI-LABEL: fmul_f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fmul_f16_imm_b: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -61,34 +221,110 @@ ret void } -; GCN-LABEL: {{^}}fmul_v2f16: -; SIVI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SIVI: buffer_load_dword v[[A_V2_F16:[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] -; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] - -; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16( +; SI-LABEL: fmul_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -100,30 +336,91 @@ ret void } -; GCN-LABEL: {{^}}fmul_v2f16_imm_a: -; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - - -; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400 -; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] - -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200 -; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] - -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16_imm_a( +; SI-LABEL: fmul_v2f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_v2f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_v2f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x44004200 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_v2f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v0, 0x44004200, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: @@ -133,29 +430,91 @@ ret void } -; GCN-LABEL: {{^}}fmul_v2f16_imm_b: -; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - -; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200 -; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] - -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 -; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] - -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16_imm_b( +; SI-LABEL: fmul_v2f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_v2f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x4200 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_v2f16_imm_b: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x42004400 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_v2f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v0, 0x42004400, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -165,23 +524,127 @@ ret void } -; GCN-LABEL: {{^}}fmul_v4f16: -; GFX9: buffer_load_dwordx2 v[[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]] -; GFX9: buffer_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] - -; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] -; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] -; GFX9: buffer_store_dwordx2 v[[[MUL_LO]]:[[MUL_HI]]] - -; VI: buffer_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] -; VI: buffer_load_dwordx2 v[[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]] -; VI: v_mul_f16_sdwa -; VI: v_mul_f16_e32 -; VI: v_mul_f16_sdwa -; VI: v_mul_f16_e32 -; VI: v_or_b32 -; VI: v_or_b32 define amdgpu_kernel void @fmul_v4f16( +; SI-LABEL: fmul_v4f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mul_f32_e32 v5, v7, v5 +; SI-NEXT: v_mul_f32_e32 v4, v6, v4 +; SI-NEXT: v_mul_f32_e32 v1, v3, v1 +; SI-NEXT: v_mul_f32_e32 v0, v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_v4f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v1, v3, v1 +; VI-NEXT: v_mul_f16_sdwa v3, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_v4f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1 +; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_v4f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1 +; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -193,27 +656,106 @@ ret void } -; GCN-LABEL: {{^}}fmul_v4f16_imm_a: -; GFX89-DAG: buffer_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] -; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200 -; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800 - -; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], [[K0]] -; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], [[K1]] -; GFX9: buffer_store_dwordx2 v[[[MUL_LO]]:[[MUL_HI]]] - -; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400 - -; VI-DAG: v_mul_f16_sdwa v[[MUL_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_mul_f16_e32 v[[MUL_HI_LO:[0-9]+]], 0x4200, v[[A_HI]] -; VI-DAG: v_add_f16_sdwa v[[MUL_LO_HI:[0-9]+]], v[[A_LO]], v[[A_LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mul_f16_e32 v[[MUL_LO_LO:[0-9]+]], 0x4800, v[[A_LO]] - -; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MUL_LO_LO]], v[[MUL_LO_HI]] -; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MUL_HI_LO]], v[[MUL_HI_HI]] - -; VI: buffer_store_dwordx2 v[[[OR0]]:[[OR1]]] define amdgpu_kernel void @fmul_v4f16_imm_a( +; SI-LABEL: fmul_v4f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 0x40400000, v3 +; SI-NEXT: v_mul_f32_e32 v2, 0x41000000, v2 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_v4f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v1, 0x4200, v1 +; VI-NEXT: v_add_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v0, 0x4800, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_v4f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s2, 0x44004200 +; GFX9-NEXT: s_mov_b32 s3, 0x40004800 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v1, v1, s2 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s3 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_v4f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v1, 0x44004200, v1 +; GFX11-NEXT: v_pk_mul_f16 v0, 0x40004800, v0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: @@ -222,3 +764,5 @@ store <4 x half> %r.val, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,SI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,CI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s declare half @llvm.nearbyint.f16(half) #0 declare float @llvm.nearbyint.f32(float) #0 @@ -48,6 +49,18 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fnearbyint_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_rndne_f16_e32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %1 = call half @llvm.nearbyint.f16(half %in) store half %1, ptr addrspace(1) %out ret void @@ -75,6 +88,18 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fnearbyint_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_rndne_f32_e32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call float @llvm.nearbyint.f32(float %in) store float %0, ptr addrspace(1) %out @@ -105,6 +130,17 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fnearbyint_v2f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_rndne_f32_e32 v1, s3 +; GFX11-NEXT: v_rndne_f32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) store <2 x float> %0, ptr addrspace(1) %out @@ -139,6 +175,21 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fnearbyint_v4f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_rndne_f32_e32 v3, s7 +; GFX11-NEXT: v_rndne_f32_e32 v2, s6 +; GFX11-NEXT: v_rndne_f32_e32 v1, s5 +; GFX11-NEXT: v_rndne_f32_e32 v0, s4 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) store <4 x float> %0, ptr addrspace(1) %out @@ -189,6 +240,16 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: nearbyint_f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call double @llvm.nearbyint.f64(double %in) store double %0, ptr addrspace(1) %out @@ -251,6 +312,19 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: nearbyint_v2f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] +; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) store <2 x double> %0, ptr addrspace(1) %out @@ -341,6 +415,23 @@ ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: nearbyint_v4f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] +; GFX11-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] +; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] +; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) store <4 x double> %0, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -1,10 +1,58 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,GFX89 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,GFX8 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s ; FIXME: Should be able to do scalar op -; GCN-LABEL: {{^}}s_fneg_f16: define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { +; CI-LABEL: s_fneg_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: s_fneg_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = fsub half -0.0, %in store half %fneg, ptr addrspace(1) %out ret void @@ -12,13 +60,57 @@ ; FIXME: Should be able to use bit operations when illegal type as ; well. - -; GCN-LABEL: {{^}}v_fneg_f16: -; GCN: {{flat|global}}_load_ushort [[VAL:v[0-9]+]], -; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] -; SI: buffer_store_short [[XOR]] define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; CI-LABEL: v_fneg_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_ushort v2, v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: v_fneg_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_fneg_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid @@ -28,30 +120,114 @@ ret void } -; GCN-LABEL: {{^}}s_fneg_free_f16: -; GCN: s_load_dword [[NEG_VALUE:s[0-9]+]], -; GCN: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}} -; GCN: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]] -; GCN: {{flat|global}}_store_short v{{.+}}, [[V_XOR]] define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { +; CI-LABEL: s_fneg_free_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: s_fneg_free_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_free_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_free_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %bc = bitcast i16 %in to half %fsub = fsub half -0.0, %bc store half %fsub, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_fneg_fold_f16: -; GCN: {{flat|global}}_load_ushort [[NEG_VALUE:v[0-9]+]] - -; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]] -; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]] -; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[NEG_CVT0]], [[CVT_VAL]] -; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]] -; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]] - -; VI-NOT: [[NEG_VALUE]] -; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; CI-LABEL: v_fneg_fold_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_ushort v0, v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: v_fneg_fold_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f16_e64 v2, -v2, v2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_fneg_fold_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_f16_e64 v1, -v1, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fneg_fold_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f16_e64 v1, -v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in %fsub = fsub half -0.0, %val %fmul = fmul half %fsub, %val @@ -59,17 +235,100 @@ ret void } -; GCN-LABEL: {{^}}s_fneg_v2f16: -; GCN: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { +; CI-LABEL: s_fneg_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: s_fneg_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = fsub <2 x half> , %in store <2 x half> %fneg, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}s_fneg_v2f16_nonload: -; GCN: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { +; CIVI-LABEL: s_fneg_v2f16_nonload: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CIVI-NEXT: ;;#ASMSTART +; CIVI-NEXT: ; def s2 +; CIVI-NEXT: ;;#ASMEND +; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: flat_store_dword v[0:1], v2 +; CIVI-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_v2f16_nonload: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s2 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_v2f16_nonload: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def s2 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %in = call i32 asm sideeffect "; def $0", "=s"() %in.bc = bitcast i32 %in to <2 x half> %fneg = fsub <2 x half> , %in.bc @@ -77,10 +336,57 @@ ret void } -; GCN-LABEL: {{^}}v_fneg_v2f16: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] -; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]] define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; CI-LABEL: v_fneg_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: v_fneg_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_fneg_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fneg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid @@ -90,35 +396,125 @@ ret void } -; GCN-LABEL: {{^}}fneg_free_v2f16: -; GCN: s_load_dword [[VAL:s[0-9]+]] -; GCN: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000 define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { +; CI-LABEL: fneg_free_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: fneg_free_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_free_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_free_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to <2 x half> %fsub = fsub <2 x half> , %bc store <2 x half> %fsub, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_fneg_fold_v2f16: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] - -; CI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, [[VAL]] -; CI: v_lshrrev_b32_e32 -; CI: v_lshrrev_b32_e32 - -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_cvt_f16_f32 -; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_cvt_f16_f32 - -; VI: v_mul_f16_sdwa v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} - -; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}} define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; CI-LABEL: v_fneg_fold_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mul_f32_e32 v1, v3, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; CI-NEXT: v_mul_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: v_fneg_fold_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f16_sdwa v3, -v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e64 v2, -v2, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_fneg_fold_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v1 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fneg_fold_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v1, v1, v1 neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %fsub = fsub <2 x half> , %val %fmul = fmul <2 x half> %fsub, %val @@ -126,16 +522,78 @@ ret void } -; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16: -; GCN-DAG: {{flat|global}}_load_dword [[VAL:v[0-9]+]] -; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}} -; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} - -; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]] -; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 -; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 - define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { +; CI-LABEL: v_extract_fneg_fold_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 +; CI-NEXT: v_sub_f32_e32 v1, 2.0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: flat_store_short v[0:1], v0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: flat_store_short v[0:1], v1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: v_extract_fneg_fold_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, 0x4000 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f16_e32 v2, -4.0, v0 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[0:1], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_extract_fneg_fold_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_f16_e32 v2, -4.0, v0 +; GFX9-NEXT: v_sub_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_extract_fneg_fold_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, -4.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_sub_f16_e32 v1, 2.0, v1 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %fneg = fsub <2 x half> , %val %elt0 = extractelement <2 x half> %fneg, i32 0 @@ -148,12 +606,51 @@ ret void } -; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] -; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]] -; CIVI: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]] -; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[NEG]], off define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 { +; CIVI-LABEL: v_extract_fneg_no_fold_v2f16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: flat_load_dword v0, v[0:1] +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CIVI-NEXT: flat_store_short v[0:1], v0 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: flat_store_short v[0:1], v1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: s_endpgm +; +; GFX9-LABEL: v_extract_fneg_no_fold_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX9-NEXT: global_store_short v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_extract_fneg_no_fold_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %fneg = fsub <2 x half> , %val %elt0 = extractelement <2 x half> %fneg, i32 0 @@ -167,3 +664,5 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}}