diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -1,12 +1,70 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s - -; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32: -; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}} -; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -19,9 +77,67 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_f32: -; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -35,9 +151,67 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_commute0_f32: -; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -51,9 +225,67 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_commute1_f32: -; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -67,10 +299,70 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_constant_order_f32: -; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_min_f32_e32 v2, 2.0, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_max_f32_e32 v2, 4.0, v2 +; VI-NEXT: v_min_f32_e32 v2, 2.0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX9-NEXT: v_min_f32_e32 v1, 2.0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_maxmin_f32 v1, v1, 4.0, 2.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -84,10 +376,83 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_multi_use_f32: -; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_max_f32_e32 v2, 2.0, v2 +; SI-NEXT: v_min_f32_e32 v3, 4.0, v2 +; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_max_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_min_f32_e32 v3, 4.0, v2 +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_max_f32_e32 v1, 2.0, v1 +; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v1 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v1, 2.0, v1 +; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v1 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -102,10 +467,71 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64: -; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0 -; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0 define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_fmed3_r_i_i_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_max_f64 v[2:3], v[2:3], 2.0 +; SI-NEXT: v_min_f64 v[2:3], v[2:3], 4.0 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_r_i_i_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; VI-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_r_i_i_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_r_i_i_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr double, ptr addrspace(1) %out, i32 %tid @@ -119,9 +545,62 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32: -; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { +; SI-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_med3_f32 v2, v3, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -134,9 +613,67 @@ ret void } -; GCN-LABEL: {{^}}v_test_legacy_fmed3_r_i_i_f32: -; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -155,12 +692,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0: -; GCN: {{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, -v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, -v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, -v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, -v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -178,12 +791,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod1: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, -v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, -v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, -v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, -v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -201,12 +890,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod2: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, -v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, -v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, -v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, -v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -224,12 +989,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod012: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]| define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, -v2, |v3|, -|v4| +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, -v7, |v2|, -|v3| +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -253,12 +1094,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_negabs012: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]| define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, -|v2|, -|v3|, -|v4| +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, -|v7|, -|v2|, -|v3| +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -284,15 +1201,100 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_inputs_med3_f32_pat0: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN-DAG: v_add_f32_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]] -; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]] -; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]] define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_inputs_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_inputs_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -314,13 +1316,88 @@ ret void } - -; GCN-LABEL: {{^}}v_nnan_input_calls_med3_f32_pat0: -; GCN: {{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_input_calls_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_input_calls_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -337,12 +1414,88 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_call_med3_f32_pat0: -; GCN: {{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_call_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_call_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_call_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_call_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -359,12 +1512,88 @@ ret void } -; GCN-LABEL: {{^}}v_fast_call_med3_f32_pat0: -; GCN: {{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_fast_call_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_fast_call_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_fast_call_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fast_call_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -391,15 +1620,90 @@ ; 5: max(min(y, x), min(max(y, x), z)) ; 6: max(min(y, x), min(z, max(x, y))) ; 7: max(min(y, x), min(z, max(y, x))) -; ; + commute outermost max -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -416,12 +1720,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat1: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -438,12 +1818,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat2: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -460,12 +1916,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat3: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat3: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat3: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -482,12 +2014,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat4: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat4: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat4: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -504,12 +2112,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat5: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat5: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat5: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -526,12 +2210,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat6: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat6: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat6: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -548,12 +2308,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat7: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat7: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat7: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -570,12 +2406,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat8: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat8: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat8: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -592,12 +2504,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat9: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat9: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat9: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -614,12 +2602,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat10: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat10: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat10: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -636,12 +2700,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat11: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat11: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat11: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -658,12 +2798,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat12: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat12: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat12: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -680,12 +2896,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat13: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat13: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat13: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -702,12 +2994,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat14: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat14: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat14: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -724,12 +3092,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat15: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat15: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat15: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -749,12 +3193,88 @@ ; Also handle `min` at the root: ; min(max(x, y), max(min(x, y), z)) -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat16: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -775,12 +3295,119 @@ ; Negative patterns ; --------------------------------------------------------------------- -; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0: -; GCN-DAG: v_min_f32 -; GCN-DAG: v_max_f32 -; GCN: v_min_f32 -; GCN: v_max_f32 define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -798,8 +3425,120 @@ ret void } -; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1: define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v1, v2 +; GFX11-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -817,8 +3556,119 @@ ret void } -; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2: define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -836,9 +3686,109 @@ ret void } - -; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0: define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_test_safe_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_safe_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: flat_load_dword v6, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_safe_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_safe_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -855,8 +3805,100 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0: define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -878,8 +3920,100 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0: define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -901,8 +4035,100 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0: define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -924,15 +4150,98 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN-DAG: v_min_f32 -; GCN-DAG: v_max_f32 -; GCN-DAG: v_min_f32 -; GCN-DAG: v_max_f32 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_min_f32_e64 v5, -v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: flat_load_dword v6, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_min_f32_e64 v4, -v6, v2 +; VI-NEXT: v_max_f32_e32 v2, v6, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_f32_e64 v4, -v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_minmax_f32 v1, -v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -951,13 +4260,91 @@ } ; A simple min and max is not sufficient -; GCN-LABEL: {{^}}v_test_global_nnans_min_max_f32: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]] -; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]] define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_min_max_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_min_max_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_max_f32_e32 v2, v7, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_min_max_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_min_max_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_maxmin_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -972,19 +4359,70 @@ ret void } -; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16: -; SI: v_cvt_f32_f16 -; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 -; SI: v_cvt_f16_f32 - -; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0 -; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0 -; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0 - -; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0 -; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_e32 v2, 1.0, v3 +; VI-NEXT: v_max_f16_e32 v2, 2.0, v2 +; VI-NEXT: v_min_f16_e32 v2, 4.0, v2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr half, ptr addrspace(1) %out, i32 %tid @@ -997,31 +4435,108 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0: -; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[C:v[0-9]+]] - -; SI: v_cvt_f32_f16 -; SI: v_cvt_f32_f16 -; SI: v_add_f32_e32 -; SI: v_add_f32_e32 -; SI: v_add_f32_e32 -; SI: v_med3_f32 -; SI: v_cvt_f16_f32_e32 - - -; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]] -; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]] -; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]] - -; VI-DAG: v_min_f16 -; VI-DAG: v_max_f16 -; VI: v_min_f16 -; VI: v_max_f16 - -; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]] define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_inputs_med3_f16_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_inputs_med3_f16_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f16_e32 v4, 1.0, v7 +; VI-NEXT: v_add_f16_e32 v2, 2.0, v2 +; VI-NEXT: v_add_f16_e32 v3, 4.0, v3 +; VI-NEXT: v_min_f16_e32 v5, v4, v2 +; VI-NEXT: v_max_f16_e32 v2, v4, v2 +; VI-NEXT: v_min_f16_e32 v2, v2, v3 +; VI-NEXT: v_max_f16_e32 v2, v5, v2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f16_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX9-NEXT: v_med3_f16 v1, v1, v2, v3 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_u16 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX11-NEXT: v_add_f16_e32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f16 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr half, ptr addrspace(1) %bptr, i32 %tid @@ -1043,11 +4558,71 @@ ret void } -; GCN-LABEL: {{^}}two_non_inline_constant: -; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, -; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x41000000, [[ADD]] -; GCN: v_min_f32_e32 v{{[0-9]+}}, 0x41800000, [[MAX]] define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: two_non_inline_constant: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0.5, v2 +; SI-NEXT: v_max_f32_e32 v2, 0x41000000, v2 +; SI-NEXT: v_min_f32_e32 v2, 0x41800000, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: two_non_inline_constant: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-NEXT: v_max_f32_e32 v2, 0x41000000, v2 +; VI-NEXT: v_min_f32_e32 v2, 0x41800000, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: two_non_inline_constant: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v1 +; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: two_non_inline_constant: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_mov_b32 s2, 0x41000000 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_maxmin_f32 v1, v1, s2, 0x41800000 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -1061,11 +4636,83 @@ } ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants. -; GCN-LABEL: {{^}}one_non_inline_constant: -; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41800000 -; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, -; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 1.0, [[K1]] define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: one_non_inline_constant: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v3, 0x41800000 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v4, 0.5, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x41800000, v2 +; SI-NEXT: v_med3_f32 v3, v4, 1.0, v3 +; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: one_non_inline_constant: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x41800000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-NEXT: v_med3_f32 v2, v2, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x41800000, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: one_non_inline_constant: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1 +; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1 +; GFX9-NEXT: v_med3_f32 v2, v3, 1.0, v2 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: one_non_inline_constant: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v2, 0.5, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 0x41800000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_med3_f32 v2, v2, 1.0, 0x41800000 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -1081,12 +4728,99 @@ ret void } -; GCN-LABEL: {{^}}two_non_inline_constant_multi_use: -; GCN-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x41000000 -; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x41800000 -; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, -; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[K0]], [[VK1]] define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: two_non_inline_constant_multi_use: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0x41000000 +; SI-NEXT: v_mov_b32_e32 v3, 0x41800000 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v4, 0.5, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x41800000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x41000000, v2 +; SI-NEXT: v_med3_f32 v3, v4, s4, v3 +; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: two_non_inline_constant_multi_use: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x41800000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b32 s2, 0x41000000 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-NEXT: v_med3_f32 v2, v2, s2, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x41800000, v3 +; VI-NEXT: v_add_f32_e32 v3, 0x41000000, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: two_non_inline_constant_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0x41000000 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1 +; GFX9-NEXT: v_add_f32_e32 v4, 0x41800000, v1 +; GFX9-NEXT: v_add_f32_e32 v1, 0x41000000, v1 +; GFX9-NEXT: v_med3_f32 v2, v3, s2, v2 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: two_non_inline_constant_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_mov_b32 s2, 0x41000000 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v3, 0x41800000, v1 +; GFX11-NEXT: v_add_f32_e32 v2, 0.5, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 0x41000000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_med3_f32 v2, v2, s2, 0x41800000 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -1,14 +1,130 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s -; GCN-LABEL: {{^}}test_fmin3_olt_0_f32: -; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: buffer_load_dword [[REGB:v[0-9]+]] -; GCN: buffer_load_dword [[REGA:v[0-9]+]] -; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_0_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_min3_f32 v0, v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_0_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_min3_f32 v0, v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_0_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -19,13 +135,127 @@ } ; Commute operand of second fmin -; GCN-LABEL: {{^}}test_fmin3_olt_1_f32: -; GCN: buffer_load_dword [[REGB:v[0-9]+]] -; GCN: buffer_load_dword [[REGA:v[0-9]+]] -; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_1_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_min3_f32 v0, v2, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_1_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_min3_f32 v0, v2, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_1_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_min3_f32 v0, v2, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -35,20 +265,135 @@ ret void } -; GCN-LABEL: {{^}}test_fmin3_olt_0_f16: -; GCN: buffer_load_ushort [[REGC:v[0-9]+]] -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] -; GCN: buffer_load_ushort [[REGA:v[0-9]+]] - -; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]] - -; VI: v_min_f16_e32 -; VI: v_min_f16_e32 [[RESULT:v[0-9]+]], - -; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_0_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min3_f32 v0, v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_0_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v1, v2, v2 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_0_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2 +; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -59,23 +404,135 @@ } ; Commute operand of second fmin -; GCN-LABEL: {{^}}test_fmin3_olt_1_f16: -; GCN: buffer_load_ushort [[REGA:v[0-9]+]] -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] -; GCN: buffer_load_ushort [[REGC:v[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] -; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]] -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] - -; VI: v_min_f16_e32 -; VI: v_min_f16_e32 [[RESULT:v[0-9]+]], - -; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] -; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_1_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min3_f32 v0, v2, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_1_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v1, v2, v2 +; VI-NEXT: v_min_f16_e32 v0, v1, v0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_1_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_min3_f16 v0, v2, v0, v1 +; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_1_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -87,30 +544,61 @@ ; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of min3 ; since there are no pack instructions for fmin3. -; GCN-LABEL: {{^}}no_fmin3_v2f16: - -; SI: v_cvt_f16_f32_e32 -; SI: v_min_f32_e32 -; SI-NEXT: v_min_f32_e32 -; SI-NEXT: v_min3_f32 -; SI-NEXT: v_min3_f32 - -; VI: s_waitcnt -; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_min_f16_e32 v0, v2, v0 -; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_min_f16_e32 v0, v0, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_setpc_b64 - -; GFX9: s_waitcnt -; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX9-NEXT: v_pk_min_f16 v0, v2, v0 -; GFX9-NEXT: v_pk_min_f16 v0, v0, v3 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 { +; SI-LABEL: no_fmin3_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: v_min_f32_e32 v0, v0, v2 +; SI-NEXT: v_min3_f32 v0, v4, v0, v6 +; SI-NEXT: v_min3_f32 v1, v5, v1, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: no_fmin3_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_min_f16_e32 v0, v2, v0 +; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: no_fmin3_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: v_pk_min_f16 v0, v2, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: no_fmin3_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_min_f16 v0, v2, v0 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min) @@ -118,9 +606,142 @@ ret <2 x half> %res } -; GCN-LABEL: {{^}}test_fmin3_olt_0_f64: -; GCN-NOT: v_min3 define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_0_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; SI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; SI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_0_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_0_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_0_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 @@ -131,9 +752,142 @@ } ; Commute operand of second fmin -; GCN-LABEL: {{^}}test_fmin3_olt_1_f64: -; GCN-NOT: v_min3 define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmin3_olt_1_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; SI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; SI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; SI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmin3_olt_1_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; VI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmin3_olt_1_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmin3_olt_1_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -1,18 +1,83 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s -; GCN-LABEL: {{^}}fmul_f16 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_f16( +; SI-LABEL: fmul_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fmul_f16: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_mov_b32 s14, s2 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s12, s6 +; GFX89-NEXT: s_mov_b32 s13, s7 +; GFX89-NEXT: s_mov_b32 s15, s3 +; GFX89-NEXT: s_mov_b32 s10, s2 +; GFX89-NEXT: s_mov_b32 s11, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s0, s4 +; GFX89-NEXT: s_mov_b32 s1, s5 +; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -24,15 +89,63 @@ ret void } -; GCN-LABEL: {{^}}fmul_f16_imm_a -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_f16_imm_a( +; SI-LABEL: fmul_f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fmul_f16_imm_a: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: @@ -42,16 +155,63 @@ ret void } -; GCN-LABEL: {{^}}fmul_f16_imm_b -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] - -; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_f16_imm_b( +; SI-LABEL: fmul_f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fmul_f16_imm_b: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -61,34 +221,110 @@ ret void } -; GCN-LABEL: {{^}}fmul_v2f16: -; SIVI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SIVI: buffer_load_dword v[[A_V2_F16:[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] -; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] - -; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16( +; SI-LABEL: fmul_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -100,30 +336,91 @@ ret void } -; GCN-LABEL: {{^}}fmul_v2f16_imm_a: -; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - - -; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400 -; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] - -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200 -; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] - -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16_imm_a( +; SI-LABEL: fmul_v2f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_v2f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_v2f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x44004200 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_v2f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v0, 0x44004200, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: @@ -133,29 +430,91 @@ ret void } -; GCN-LABEL: {{^}}fmul_v2f16_imm_b: -; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] - -; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200 -; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] - -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400 -; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]] - -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fmul_v2f16_imm_b( +; SI-LABEL: fmul_v2f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_v2f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x4200 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_v2f16_imm_b: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x42004400 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_v2f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v0, 0x42004400, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -165,23 +524,127 @@ ret void } -; GCN-LABEL: {{^}}fmul_v4f16: -; GFX9: buffer_load_dwordx2 v[[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]] -; GFX9: buffer_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] - -; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] -; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] -; GFX9: buffer_store_dwordx2 v[[[MUL_LO]]:[[MUL_HI]]] - -; VI: buffer_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] -; VI: buffer_load_dwordx2 v[[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]] -; VI: v_mul_f16_sdwa -; VI: v_mul_f16_e32 -; VI: v_mul_f16_sdwa -; VI: v_mul_f16_e32 -; VI: v_or_b32 -; VI: v_or_b32 define amdgpu_kernel void @fmul_v4f16( +; SI-LABEL: fmul_v4f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mul_f32_e32 v5, v7, v5 +; SI-NEXT: v_mul_f32_e32 v4, v6, v4 +; SI-NEXT: v_mul_f32_e32 v1, v3, v1 +; SI-NEXT: v_mul_f32_e32 v0, v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_v4f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v1, v3, v1 +; VI-NEXT: v_mul_f16_sdwa v3, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_v4f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1 +; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_v4f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1 +; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -193,27 +656,106 @@ ret void } -; GCN-LABEL: {{^}}fmul_v4f16_imm_a: -; GFX89-DAG: buffer_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] -; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200 -; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800 - -; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], [[K0]] -; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], [[K1]] -; GFX9: buffer_store_dwordx2 v[[[MUL_LO]]:[[MUL_HI]]] - -; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400 - -; VI-DAG: v_mul_f16_sdwa v[[MUL_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_mul_f16_e32 v[[MUL_HI_LO:[0-9]+]], 0x4200, v[[A_HI]] -; VI-DAG: v_add_f16_sdwa v[[MUL_LO_HI:[0-9]+]], v[[A_LO]], v[[A_LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mul_f16_e32 v[[MUL_LO_LO:[0-9]+]], 0x4800, v[[A_LO]] - -; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MUL_LO_LO]], v[[MUL_LO_HI]] -; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MUL_HI_LO]], v[[MUL_HI_HI]] - -; VI: buffer_store_dwordx2 v[[[OR0]]:[[OR1]]] define amdgpu_kernel void @fmul_v4f16_imm_a( +; SI-LABEL: fmul_v4f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 0x40400000, v3 +; SI-NEXT: v_mul_f32_e32 v2, 0x41000000, v2 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fmul_v4f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v1, 0x4200, v1 +; VI-NEXT: v_add_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v0, 0x4800, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fmul_v4f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s2, 0x44004200 +; GFX9-NEXT: s_mov_b32 s3, 0x40004800 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v1, v1, s2 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s3 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fmul_v4f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v1, 0x44004200, v1 +; GFX11-NEXT: v_pk_mul_f16 v0, 0x40004800, v0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -1,10 +1,58 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,GFX89 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,GFX8 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s ; FIXME: Should be able to do scalar op -; GCN-LABEL: {{^}}s_fneg_f16: define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { +; CI-LABEL: s_fneg_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: s_fneg_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = fsub half -0.0, %in store half %fneg, ptr addrspace(1) %out ret void @@ -12,13 +60,57 @@ ; FIXME: Should be able to use bit operations when illegal type as ; well. - -; GCN-LABEL: {{^}}v_fneg_f16: -; GCN: {{flat|global}}_load_ushort [[VAL:v[0-9]+]], -; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] -; SI: buffer_store_short [[XOR]] define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; CI-LABEL: v_fneg_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_ushort v2, v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: v_fneg_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_fneg_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid @@ -28,30 +120,114 @@ ret void } -; GCN-LABEL: {{^}}s_fneg_free_f16: -; GCN: s_load_dword [[NEG_VALUE:s[0-9]+]], -; GCN: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}} -; GCN: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]] -; GCN: {{flat|global}}_store_short v{{.+}}, [[V_XOR]] define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { +; CI-LABEL: s_fneg_free_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: s_fneg_free_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_free_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_free_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %bc = bitcast i16 %in to half %fsub = fsub half -0.0, %bc store half %fsub, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_fneg_fold_f16: -; GCN: {{flat|global}}_load_ushort [[NEG_VALUE:v[0-9]+]] - -; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]] -; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]] -; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[NEG_CVT0]], [[CVT_VAL]] -; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]] -; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]] - -; VI-NOT: [[NEG_VALUE]] -; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; CI-LABEL: v_fneg_fold_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_ushort v0, v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: v_fneg_fold_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f16_e64 v2, -v2, v2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_fneg_fold_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_f16_e64 v1, -v1, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fneg_fold_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f16_e64 v1, -v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in %fsub = fsub half -0.0, %val %fmul = fmul half %fsub, %val @@ -59,17 +235,100 @@ ret void } -; GCN-LABEL: {{^}}s_fneg_v2f16: -; GCN: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { +; CI-LABEL: s_fneg_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: s_fneg_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fneg = fsub <2 x half> , %in store <2 x half> %fneg, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}s_fneg_v2f16_nonload: -; GCN: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { +; CIVI-LABEL: s_fneg_v2f16_nonload: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CIVI-NEXT: ;;#ASMSTART +; CIVI-NEXT: ; def s2 +; CIVI-NEXT: ;;#ASMEND +; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: flat_store_dword v[0:1], v2 +; CIVI-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_v2f16_nonload: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s2 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_v2f16_nonload: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def s2 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %in = call i32 asm sideeffect "; def $0", "=s"() %in.bc = bitcast i32 %in to <2 x half> %fneg = fsub <2 x half> , %in.bc @@ -77,10 +336,57 @@ ret void } -; GCN-LABEL: {{^}}v_fneg_v2f16: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] -; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]] define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; CI-LABEL: v_fneg_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: v_fneg_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_fneg_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fneg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid @@ -90,35 +396,125 @@ ret void } -; GCN-LABEL: {{^}}fneg_free_v2f16: -; GCN: s_load_dword [[VAL:s[0-9]+]] -; GCN: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000 define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { +; CI-LABEL: fneg_free_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: fneg_free_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_free_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_free_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to <2 x half> %fsub = fsub <2 x half> , %bc store <2 x half> %fsub, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_fneg_fold_v2f16: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] - -; CI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, [[VAL]] -; CI: v_lshrrev_b32_e32 -; CI: v_lshrrev_b32_e32 - -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_cvt_f16_f32 -; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI: v_cvt_f16_f32 - -; VI: v_mul_f16_sdwa v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} - -; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}} define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; CI-LABEL: v_fneg_fold_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mul_f32_e32 v1, v3, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; CI-NEXT: v_mul_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: v_fneg_fold_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f16_sdwa v3, -v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e64 v2, -v2, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_fneg_fold_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v1 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fneg_fold_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v1, v1, v1 neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %fsub = fsub <2 x half> , %val %fmul = fmul <2 x half> %fsub, %val @@ -126,16 +522,78 @@ ret void } -; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16: -; GCN-DAG: {{flat|global}}_load_dword [[VAL:v[0-9]+]] -; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}} -; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} - -; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]] -; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 -; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 - define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { +; CI-LABEL: v_extract_fneg_fold_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 +; CI-NEXT: v_sub_f32_e32 v1, 2.0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: flat_store_short v[0:1], v0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: flat_store_short v[0:1], v1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_endpgm +; +; GFX8-LABEL: v_extract_fneg_fold_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, 0x4000 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f16_e32 v2, -4.0, v0 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[0:1], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_extract_fneg_fold_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_f16_e32 v2, -4.0, v0 +; GFX9-NEXT: v_sub_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_extract_fneg_fold_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, -4.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_sub_f16_e32 v1, 2.0, v1 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %fneg = fsub <2 x half> , %val %elt0 = extractelement <2 x half> %fneg, i32 0 @@ -148,12 +606,51 @@ ret void } -; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] -; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]] -; CIVI: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]] -; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[NEG]], off define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 { +; CIVI-LABEL: v_extract_fneg_no_fold_v2f16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: flat_load_dword v0, v[0:1] +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CIVI-NEXT: flat_store_short v[0:1], v0 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: flat_store_short v[0:1], v1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: s_endpgm +; +; GFX9-LABEL: v_extract_fneg_no_fold_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX9-NEXT: global_store_short v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_extract_fneg_no_fold_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %fneg = fsub <2 x half> , %val %elt0 = extractelement <2 x half> %fneg, i32 0