diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -1,15 +1,64 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=CI,GCN %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=VI,GFX89,GCN %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9,GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,CI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s -; GCN-LABEL: {{^}}fneg_fabs_fadd_f16: -; CI-DAG: v_cvt_f32_f16_e32 -; CI-DAG: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |s{{[0-9]+}}| -; CI: v_sub_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_ABS_X]] - -; GFX89-NOT: _and -; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) { +; CI-LABEL: fneg_fabs_fadd_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; CI-NEXT: s_lshr_b32 s0, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_sub_f32_e32 v0, v1, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_fadd_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_sub_f16_e64 v2, s3, |v0| +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_fabs_fadd_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_sub_f16_e64 v1, s3, |v1| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_fabs_fadd_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2| +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %x) %fsub = fsub half -0.0, %fabs %fadd = fadd half %y, %fsub @@ -17,17 +66,62 @@ ret void } -; GCN-LABEL: {{^}}fneg_fabs_fmul_f16: -; CI-DAG: v_cvt_f32_f16_e32 -; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{s[0-9]+}}| -; CI: v_mul_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, [[CVT_NEG_ABS_X]] -; CI: v_cvt_f16_f32_e32 - -; GFX89-NOT: _and -; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{s[0-9]+}}, -|{{v[0-9]+}}| -; GFX89-NOT: [[MUL]] -; GFX89: {{flat|global}}_store_short v{{.+}}, [[MUL]] define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) { +; CI-LABEL: fneg_fabs_fmul_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_and_b32 s1, s0, 0x7fff +; CI-NEXT: s_lshr_b32 s0, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s1| +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_fmul_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mul_f16_e64 v2, s3, -|v0| +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_fabs_fmul_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mul_f16_e64 v1, s3, -|v1| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_fabs_fmul_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2| +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %x) %fsub = fsub half -0.0, %fabs %fmul = fmul half %y, %fsub @@ -38,10 +132,54 @@ ; DAGCombiner will transform: ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF)) ; unless isFabsFree returns true - -; GCN-LABEL: {{^}}fneg_fabs_free_f16: -; GCN: {{s_or_b32 s[0-9]+, s[0-9]+, 0x8000|s_bitset1_b32 s[0-9]+, 15}} define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { +; CI-LABEL: fneg_fabs_free_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_bitset1_b32 s2, 15 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_free_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bitset1_b32 s2, 15 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_fabs_free_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bitset1_b32 s2, 15 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_fabs_free_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %bc = bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) %fsub = fsub half -0.0, %fabs @@ -49,18 +187,97 @@ ret void } -; GCN-LABEL: {{^}}fneg_fabs_f16: -; GCN: {{s_or_b32 s[0-9]+, s[0-9]+, 0x8000|s_bitset1_b32 s[0-9]+, 15}} define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { +; CI-LABEL: fneg_fabs_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_bitset1_b32 s2, 15 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_bitset1_b32 s2, 15 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_fabs_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bitset1_b32 s2, 15 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %in) %fsub = fsub half -0.0, %fabs store half %fsub, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}v_fneg_fabs_f16: -; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CIVI-LABEL: v_fneg_fabs_f16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_ushort v2, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; CIVI-NEXT: flat_store_short v[0:1], v2 +; CIVI-NEXT: s_endpgm +; +; GFX9-LABEL: v_fneg_fabs_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fneg_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in, align 2 %fabs = call half @llvm.fabs.f16(half %val) %fsub = fsub half -0.0, %fabs @@ -68,14 +285,70 @@ ret void } -; GCN-LABEL: {{^}}s_fneg_fabs_v2f16_non_bc_src: -; GFX9-DAG: s_load_dword [[VAL:s[0-9]+]] -; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x40003c00 -; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], [[VAL]], [[K]] -; GFX9: v_or_b32_e32 [[RESULT:v[0-9]+]], 0x80008000, [[ADD]] - -; VI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) { +; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; CI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_fabs_v2f16_non_bc_src: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 0x4000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_f16_e64 v1, s2, 1.0 +; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_fabs_v2f16_non_bc_src: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003c00 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v1, s2, v1 +; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_fabs_v2f16_non_bc_src: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <2 x half> %in, %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %add) %fneg.fabs = fsub <2 x half> , %fabs @@ -84,43 +357,166 @@ } ; FIXME: single bit op - ; Combine turns this into integer op when bitcast source (from load) - -; GCN-LABEL: {{^}}s_fneg_fabs_v2f16_bc_src: -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) { +; CI-LABEL: s_fneg_fabs_v2f16_bc_src: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_or_b32 s2, s2, 0x80008000 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_fabs_v2f16_bc_src: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_or_b32 s2, s2, 0x80008000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_fabs_v2f16_bc_src: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_fabs_v2f16_bc_src: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg.fabs = fsub <2 x half> , %fabs store <2 x half> %fneg.fabs, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}fneg_fabs_v4f16: -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 -; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { +; CIVI-LABEL: fneg_fabs_v4f16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 +; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 +; CIVI-NEXT: v_mov_b32_e32 v3, s1 +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: v_mov_b32_e32 v2, s0 +; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CIVI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_fabs_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_or_b32 s3, s3, 0x80008000 +; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_fabs_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 +; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) %fsub = fsub <4 x half> , %fabs store <4 x half> %fsub, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16: -; CI: s_load_dword [[IN:s[0-9]+]] -; CI: s_lshr_b32 -; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, |s{{[0-9]+}}| -; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, |s{{[0-9]+}}| -; CI: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}} -; CI: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}} - -; VI: v_mul_f16_e64 v{{[0-9]+}}, |s{{[0-9]+}}|, -4.0 -; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD - -; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff -; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], -4.0 op_sel_hi:[1,0] define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { +; CI-LABEL: fold_user_fneg_fabs_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| +; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 +; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v2, v0, v1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: fold_user_fneg_fabs_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, 0xc400 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mul_f16_e64 v1, |s2|, -4.0 +; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fold_user_fneg_fabs_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX9-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fold_user_fneg_fabs_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg.fabs = fsub <2 x half> , %fabs %mul = fmul <2 x half> %fneg.fabs, @@ -128,14 +524,71 @@ ret void } -; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_v2f16: -; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff -; GFX9: s_xor_b32 [[NEG:s[0-9]+]], [[ABS]], 0x80008000 -; GFX9: v_mov_b32_e32 [[V_ABS:v[0-9]+]], [[ABS]] -; GFX9-DAG: v_mov_b32_e32 [[V_NEG:v[0-9]+]], [[NEG]] -; GFX9-DAG: global_store_dword v{{[0-9]+}}, [[V_ABS]], s{{\[[0-9]+:[0-9]+\]}} -; GFX9: global_store_dword v{{[0-9]+}}, [[V_NEG]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { +; CI-LABEL: s_fneg_multi_use_fabs_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_or_b32 s1, s0, 0x80008000 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_store_dword v[0:1], v4 +; CI-NEXT: v_mov_b32_e32 v0, s1 +; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_multi_use_fabs_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_xor_b32 s1, s0, 0x80008000 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_multi_use_fabs_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff +; GFX9-NEXT: s_xor_b32 s5, s4, 0x80008000 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_multi_use_fabs_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_xor_b32 s5, s4, 0x80008000 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg = fsub <2 x half> , %fabs store <2 x half> %fabs, ptr addrspace(1) %out0 @@ -143,10 +596,80 @@ ret void } -; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_foldable_neg_v2f16: -; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff -; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], -4.0 op_sel_hi:[1,0] define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { +; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s0 +; CI-NEXT: v_cvt_f32_f16_e64 v4, |s4| +; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mul_f32_e32 v5, -4.0, v5 +; CI-NEXT: v_mul_f32_e32 v4, -4.0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_mov_b32_e32 v6, s0 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_or_b32_e32 v4, v4, v5 +; CI-NEXT: flat_store_dword v[0:1], v6 +; CI-NEXT: flat_store_dword v[2:3], v4 +; CI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: v_mov_b32_e32 v5, 0xc400 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff +; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: flat_store_dword v[2:3], v4 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_pk_mul_f16 v1, s4, -4.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_pk_mul_f16 v2, s4, -4.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg = fsub <2 x half> , %fabs %mul = fmul <2 x half> %fneg, diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -1,15 +1,51 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s declare float @llvm.fabs.f32(float) #1 declare double @llvm.fabs.f64(double) #1 -; GCN-LABEL: {{^}}test_isinf_pattern: -; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}} -; GCN: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] -; GCN-NOT: v_cmp -; GCN: s_endpgm define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, float %x) #0 { +; SI-LABEL: test_isinf_pattern: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x204 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isinf_pattern: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x204 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isinf_pattern: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 %ext = zext i1 %cmp to i32 @@ -17,10 +53,46 @@ ret void } -; GCN-LABEL: {{^}}test_not_isinf_pattern_0: -; GCN-NOT: v_cmp_class -; GCN: s_endpgm define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { +; SI-LABEL: test_not_isinf_pattern_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s0|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_not_isinf_pattern_0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_not_isinf_pattern_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 %ext = zext i1 %cmp to i32 @@ -28,10 +100,35 @@ ret void } -; GCN-LABEL: {{^}}test_not_isinf_pattern_1: -; GCN-NOT: v_cmp_class -; GCN: s_endpgm define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { +; SI-LABEL: test_not_isinf_pattern_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_not_isinf_pattern_1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_not_isinf_pattern_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 %ext = zext i1 %cmp to i32 @@ -39,13 +136,46 @@ ret void } -; GCN-LABEL: {{^}}test_isfinite_pattern_0: -; GCN-NOT: v_cmp -; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}} -; GCN: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] -; GCN-NOT: v_cmp -; GCN: s_endpgm define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { +; SI-LABEL: test_isfinite_pattern_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isfinite_pattern_0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isfinite_pattern_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -55,13 +185,46 @@ ret void } -; SI-LABEL: {{^}}test_isfinite_pattern_1: -; SI-NOT: v_cmp -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}} -; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] -; SI-NOT: v_cmp -; SI: s_endpgm define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { +; SI-LABEL: test_isfinite_pattern_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isfinite_pattern_1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isfinite_pattern_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %x.fabs = tail call float @llvm.fabs.f32(float %x) #3 %cmpinf = fcmp one float %x.fabs, 0x7FF0000000000000 %ext = zext i1 %cmpinf to i32 @@ -70,10 +233,44 @@ } ; Use negative infinity -; GCN-LABEL: {{^}}test_isfinite_not_pattern_0: -; GCN-NOT: v_cmp_class_f32 -; GCN: s_endpgm define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { +; SI-LABEL: test_isfinite_not_pattern_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s4, s4 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isfinite_not_pattern_0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isfinite_not_pattern_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0xFFF0000000000000 @@ -84,10 +281,52 @@ } ; No fabs -; GCN-LABEL: {{^}}test_isfinite_not_pattern_1: -; GCN-NOT: v_cmp_class_f32 -; GCN: s_endpgm define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { +; SI-LABEL: test_isfinite_not_pattern_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isfinite_not_pattern_1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 +; VI-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isfinite_not_pattern_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %ninf = fcmp une float %x, 0x7FF0000000000000 %and = and i1 %ord, %ninf @@ -97,10 +336,50 @@ } ; fabs of different value -; GCN-LABEL: {{^}}test_isfinite_not_pattern_2: -; GCN-NOT: v_cmp_class_f32 -; GCN: s_endpgm define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocapture %out, float %x, float %y) #0 { +; SI-LABEL: test_isfinite_not_pattern_2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2 +; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isfinite_not_pattern_2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2 +; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0 +; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isfinite_not_pattern_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s3| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -111,10 +390,52 @@ } ; Wrong ordered compare type -; GCN-LABEL: {{^}}test_isfinite_not_pattern_3: -; GCN-NOT: v_cmp_class_f32 -; GCN: s_endpgm define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocapture %out, float %x) #0 { +; SI-LABEL: test_isfinite_not_pattern_3: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_u_f32_e64 s[0:1], s2, s2 +; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s2|, v0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isfinite_not_pattern_3: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4 +; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s4|, v0 +; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isfinite_not_pattern_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ord = fcmp uno float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -124,12 +445,46 @@ ret void } -; GCN-LABEL: {{^}}test_isfinite_pattern_4: -; GCN-DAG: s_load_dword [[X:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1f8 -; GCN-DAG: v_cmp_class_f32_e32 vcc, [[X]], [[K]] -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %out, float %x) #0 { +; SI-LABEL: test_isfinite_pattern_4: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isfinite_pattern_4: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isfinite_pattern_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 @@ -139,12 +494,46 @@ ret void } -; GCN-LABEL: {{^}}test_isfinite_pattern_4_commute_and: -; GCN-DAG: s_load_dword [[X:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1f8 -; GCN-DAG: v_cmp_class_f32_e32 vcc, [[X]], [[K]] -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) nocapture %out, float %x) #0 { +; SI-LABEL: test_isfinite_pattern_4_commute_and: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isfinite_pattern_4_commute_and: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isfinite_pattern_4_commute_and: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 @@ -154,23 +543,57 @@ ret void } -; GCN-LABEL: {{^}}test_not_isfinite_pattern_4_wrong_ord_test: -; GCN-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x14|0x50}} - -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1f8 -; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] - -; SI-DAG: v_cmp_o_f32_e32 vcc, [[X]], [[VY]] -; SI-DAG: v_cmp_class_f32_e64 [[CLASS:s\[[0-9]+:[0-9]+\]]], [[X]], [[K]] -; SI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CLASS]] - -; VI-DAG: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[X]], [[VY]] -; VI-DAG: v_cmp_class_f32_e32 vcc, [[X]], [[K]] -; VI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP]], vcc - -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[AND]] define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrspace(1) nocapture %out, float %x, [8 x i32], float %y) #0 { +; SI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_cmp_o_f32_e32 vcc, s0, v1 +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s0, v0 +; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s4, s[0:1], 0x50 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s5, v0 +; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s5, v1 +; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x50 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s2, s3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, %y %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 @@ -180,18 +603,47 @@ ret void } -; GCN-LABEL: {{^}}test_isinf_pattern_f16: -; SI-DAG: s_mov_b32 [[INF:s[0-9]+]], 0x7f800000 -; SI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |s{{[0-9]+}}| -; SI: v_cmp_eq_f32_e32 vcc, [[INF]], [[CVT]] -; SI-NEXT: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc - -; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}} -; VI: v_cmp_class_f16_e32 vcc, s{{[0-9]+}}, [[MASK]] -; VI-NOT: v_cmp - -; GCN: s_endpgm define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 { +; SI-LABEL: test_isinf_pattern_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s1, 0x7f800000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; SI-NEXT: v_cmp_eq_f32_e32 vcc, s1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isinf_pattern_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x204 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isinf_pattern_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = tail call half @llvm.fabs.f16(half %x) #1 %cmp = fcmp oeq half %fabs, 0xH7C00 %ext = zext i1 %cmp to i32 @@ -199,19 +651,47 @@ ret void } -; GCN-LABEL: {{^}}test_isfinite_pattern_0_f16: -; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1f8 -; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_class_f32_e64 [[CLASS:s\[[0-9]+:[0-9]+\]]], [[CVT]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CLASS]] - -; VI-NOT: v_cmp -; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}} -; VI: v_cmp_class_f16_e32 vcc, s{{[0-9]+}}, [[MASK]] -; VI-NOT: v_cmp - -; GCN: s_endpgm define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { +; SI-LABEL: test_isfinite_pattern_0_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_movk_i32 s1, 0x1f8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isfinite_pattern_0_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isfinite_pattern_0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 %ninf = fcmp une half %x.fabs, 0xH7C00 @@ -221,17 +701,47 @@ ret void } -; GCN-LABEL: {{^}}test_isfinite_pattern_4_f16: -; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1f8 -; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}} -; SI: v_cmp_class_f32_e64 [[CLASS:s\[[0-9]+:[0-9]+\]]], [[CVT]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CLASS]] - -; VI-DAG: s_load_dword [[X:s[0-9]+]] -; VI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8 -; VI: v_cmp_class_f16_e32 vcc, [[X]], [[MASK]] -; VI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 { +; SI-LABEL: test_isfinite_pattern_4_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_movk_i32 s1, 0x1f8 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_isfinite_pattern_4_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: test_isfinite_pattern_4_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 %ninf = fcmp one half %x.fabs, 0xH7C00 diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -1,20 +1,101 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefixes=CAYMAN %s declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone -; FUNC-LABEL: {{^}}test_convert_fp16_to_fp32: -; GCN: buffer_load_ushort [[VAL:v[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[RESULT]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[RES:T[0-9]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RES:T[0-9]+\.[XYZW]]] -; EGCM: VTX_READ_16 [[VAL:T[0-9]+\.[XYZW]]] -; EGCM: FLT16_TO_FLT32{{[ *]*}}[[RES]], [[VAL]] define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { +; GFX6-LABEL: test_convert_fp16_to_fp32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_convert_fp16_to_fp32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s10, s6 +; GFX8-NEXT: s_mov_b32 s11, s7 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: test_convert_fp16_to_fp32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; CYPRESS-LABEL: test_convert_fp16_to_fp32: +; CYPRESS: ; %bb.0: +; CYPRESS-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; CYPRESS-NEXT: TEX 0 @6 +; CYPRESS-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; CYPRESS-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; CYPRESS-NEXT: CF_END +; CYPRESS-NEXT: PAD +; CYPRESS-NEXT: Fetch clause starting at 6: +; CYPRESS-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 +; CYPRESS-NEXT: ALU clause starting at 8: +; CYPRESS-NEXT: MOV * T0.X, KC0[2].Z, +; CYPRESS-NEXT: ALU clause starting at 9: +; CYPRESS-NEXT: FLT16_TO_FLT32 T0.X, T0.X, +; CYPRESS-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CYPRESS-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CAYMAN-LABEL: test_convert_fp16_to_fp32: +; CAYMAN: ; %bb.0: +; CAYMAN-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; CAYMAN-NEXT: TEX 0 @6 +; CAYMAN-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; CAYMAN-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CAYMAN-NEXT: CF_END +; CAYMAN-NEXT: PAD +; CAYMAN-NEXT: Fetch clause starting at 6: +; CAYMAN-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 +; CAYMAN-NEXT: ALU clause starting at 8: +; CAYMAN-NEXT: MOV * T0.X, KC0[2].Z, +; CAYMAN-NEXT: ALU clause starting at 9: +; CAYMAN-NEXT: FLT16_TO_FLT32 * T0.X, T0.X, +; CAYMAN-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CAYMAN-NEXT: 2(2.802597e-45), 0(0.000000e+00) %val = load i16, ptr addrspace(1) %in, align 2 %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone store float %cvt, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -1,14 +1,69 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone -; FUNC-LABEL: {{^}}test_convert_fp16_to_fp64: -; GCN: buffer_load_ushort [[VAL:v[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] -; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] -; GCN: buffer_store_dwordx2 [[RESULT]] define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { +; GFX6-LABEL: test_convert_fp16_to_fp64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_convert_fp16_to_fp64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s10, s6 +; GFX8-NEXT: s_mov_b32 s11, s7 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: test_convert_fp16_to_fp64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %in, align 2 %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone store double %cvt, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -1,18 +1,92 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone -; FUNC-LABEL: {{^}}test_convert_fp32_to_fp16: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; GCN: buffer_store_short [[RESULT]] - -; EG: MEM_RAT MSKOR -; EG: VTX_READ_32 -; EG: FLT32_TO_FLT16 define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { +; GFX6-LABEL: test_convert_fp32_to_fp16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_convert_fp32_to_fp16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s10, s6 +; GFX8-NEXT: s_mov_b32 s11, s7 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: test_convert_fp32_to_fp16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; CYPRESS-LABEL: test_convert_fp32_to_fp16: +; CYPRESS: ; %bb.0: +; CYPRESS-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; CYPRESS-NEXT: TEX 0 @6 +; CYPRESS-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] +; CYPRESS-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; CYPRESS-NEXT: CF_END +; CYPRESS-NEXT: PAD +; CYPRESS-NEXT: Fetch clause starting at 6: +; CYPRESS-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; CYPRESS-NEXT: ALU clause starting at 8: +; CYPRESS-NEXT: MOV * T0.X, KC0[2].Z, +; CYPRESS-NEXT: ALU clause starting at 9: +; CYPRESS-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x, +; CYPRESS-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; CYPRESS-NEXT: FLT32_TO_FLT16 T1.W, T0.X, +; CYPRESS-NEXT: LSHL * T0.W, PV.W, literal.x, +; CYPRESS-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; CYPRESS-NEXT: LSHL T0.X, PV.W, PS, +; CYPRESS-NEXT: LSHL * T0.W, literal.x, PS, +; CYPRESS-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; CYPRESS-NEXT: MOV T0.Y, 0.0, +; CYPRESS-NEXT: MOV * T0.Z, 0.0, +; CYPRESS-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CYPRESS-NEXT: 2(2.802597e-45), 0(0.000000e+00) %val = load float, ptr addrspace(1) %in, align 4 %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone store i16 %cvt, ptr addrspace(1) %out, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fpext-free.ll b/llvm/test/CodeGen/AMDGPU/fpext-free.ll --- a/llvm/test/CodeGen/AMDGPU/fpext-free.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext-free.ll @@ -1,19 +1,36 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9,GFX9-F32FLUSH %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9,GFX9-F32DENORM %s -; RUN: llc -march=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-F32FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-F32DENORM %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9-F32FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9-F32DENORM %s +; RUN: llc -march=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s ; fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) - -; GCN-LABEL: {{^}}fadd_fpext_fmul_f16_to_f32: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}} -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16 -; GFX9-F32DENORM-NEXT: v_add_f32 define float @fadd_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 { +; GFX11-LABEL: fadd_fpext_fmul_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fadd_fpext_fmul_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %mul.ext = fpext half %mul to float @@ -22,12 +39,27 @@ } ; f16->f64 is not free. -; GCN-LABEL: {{^}}fadd_fpext_fmul_f16_to_f64: -; GFX89: v_mul_f16 -; GFX89: v_cvt_f32_f16 -; GFX89: v_cvt_f64_f32 -; GFX89: v_add_f64 define double @fadd_fpext_fmul_f16_to_f64(half %x, half %y, double %z) #0 { +; GFX11-LABEL: fadd_fpext_fmul_f16_to_f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: fadd_fpext_fmul_f16_to_f64: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX89-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %mul.ext = fpext half %mul to double @@ -36,11 +68,24 @@ } ; f32->f64 is not free. -; GCN-LABEL: {{^}}fadd_fpext_fmul_f32_to_f64: -; GCN: v_mul_f32 -; GCN: v_cvt_f64_f32 -; GCN: v_add_f64 define double @fadd_fpext_fmul_f32_to_f64(float %x, float %y, double %z) #0 { +; GFX11-LABEL: fadd_fpext_fmul_f32_to_f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: fadd_fpext_fmul_f32_to_f64: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX89-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul float %x, %y %mul.ext = fpext float %mul to double @@ -49,16 +94,30 @@ } ; fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) -; GCN-LABEL: {{^}}fadd_fpext_fmul_f16_to_f32_commute: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}} -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16 -; GFX9-F32DENORM-NEXT: v_add_f32 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fadd_fpext_fmul_f16_to_f32_commute(half %x, half %y, float %z) #0 { +; GFX11-LABEL: fadd_fpext_fmul_f16_to_f32_commute: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32_commute: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fadd_fpext_fmul_f16_to_f32_commute: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %mul.ext = fpext half %mul to float @@ -68,20 +127,33 @@ ; fold (fadd (fma x, y, (fpext (fmul u, v))), z) ; -> (fma x, y, (fma (fpext u), (fpext v), z)) - -; GCN-LABEL: {{^}}fadd_muladd_fpext_fmul_f16_to_f32: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16 -; GFX9-F32DENORM-NEXT: v_fma_f32 -; GFX9-F32DENORM-NEXT: v_add_f32 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fadd_muladd_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 { +; GFX11-LABEL: fadd_muladd_fpext_fmul_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fadd_muladd_fpext_fmul_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fadd_muladd_fpext_fmul_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %u, %v %mul.ext = fpext half %mul to float @@ -92,19 +164,33 @@ ; fold (fadd x, (fma y, z, (fpext (fmul u, v))) ; -> (fma y, z, (fma (fpext u), (fpext v), x)) -; GCN-LABEL: {{^}}fadd_muladd_fpext_fmul_f16_to_f32_commute: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16 -; GFX9-F32DENORM-NEXT: v_fma_f32 -; GFX9-F32DENORM-NEXT: v_add_f32 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fadd_muladd_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u, half %v, float %z) #0 { +; GFX11-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX11-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %u, %v %mul.ext = fpext half %mul to float @@ -113,17 +199,33 @@ ret float %add } -; GCN-LABEL: {{^}}fadd_fmad_fpext_fmul_f16_to_f32: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 define float @fadd_fmad_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 { +; GFX11-LABEL: fadd_fmad_fpext_fmul_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fadd_fmad_fpext_fmul_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fadd_fmad_fpext_fmul_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %u, %v %mul.ext = fpext half %mul to float @@ -135,20 +237,33 @@ ; fold (fadd (fma x, y, (fpext (fmul u, v))), z) ; -> (fma x, y, (fma (fpext u), (fpext v), z)) - -; GCN-LABEL: {{^}}fadd_fma_fpext_fmul_f16_to_f32: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fadd_fma_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 { +; GFX11-LABEL: fadd_fma_fpext_fmul_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fadd_fma_fpext_fmul_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fadd_fma_fpext_fmul_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul contract half %u, %v %mul.ext = fpext half %mul to float @@ -157,19 +272,33 @@ ret float %add } -; GCN-LABEL: {{^}}fadd_fma_fpext_fmul_f16_to_f32_commute: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v4, v0 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fadd_fma_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u, half %v, float %z) #0 { +; GFX11-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX11-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul contract half %u, %v %mul.ext = fpext half %mul to float @@ -180,19 +309,34 @@ ; fold (fadd x, (fpext (fma y, z, (fmul u, v))) ; -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x)) - -; GCN-LABEL: {{^}}fadd_fpext_fmuladd_f16_to_f32: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16 -; GFX9-F32DENORM-NEXT: v_fma_f16 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16 -; GFX9-F32DENORM-NEXT: v_add_f32 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fadd_fpext_fmuladd_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 { +; GFX11-LABEL: fadd_fpext_fmuladd_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fadd_fpext_fmuladd_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fadd_fpext_fmuladd_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32DENORM-NEXT: v_fma_f16 v1, v1, v2, v3 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul contract half %u, %v %fma = call half @llvm.fmuladd.f16(half %y, half %z, half %mul) @@ -201,18 +345,34 @@ ret float %add } -; GCN-LABEL: {{^}}fadd_fpext_fma_f16_to_f32: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16 -; GFX9-F32DENORM-NEXT: v_fma_f16 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16 -; GFX9-F32DENORM-NEXT: v_add_f32 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fadd_fpext_fma_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 { +; GFX11-LABEL: fadd_fpext_fma_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fadd_fpext_fma_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fadd_fpext_fma_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32DENORM-NEXT: v_fma_f16 v1, v1, v2, v3 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul contract half %u, %v %fma = call half @llvm.fma.f16(half %y, half %z, half %mul) @@ -221,18 +381,34 @@ ret float %add } -; GCN-LABEL: {{^}}fadd_fpext_fma_f16_to_f32_commute: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16 -; GFX9-F32DENORM-NEXT: v_fma_f16 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16 -; GFX9-F32DENORM-NEXT: v_add_f32_e32 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fadd_fpext_fma_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 { +; GFX11-LABEL: fadd_fpext_fma_f16_to_f32_commute: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fadd_fpext_fma_f16_to_f32_commute: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fadd_fpext_fma_f16_to_f32_commute: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32DENORM-NEXT: v_fma_f16 v1, v1, v2, v3 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul contract half %u, %v %fma = call half @llvm.fma.f16(half %y, half %z, half %mul) @@ -243,17 +419,30 @@ ; fold (fsub (fpext (fmul x, y)), z) ; -> (fma (fpext x), (fpext y), (fneg z)) - -; GCN-LABEL: {{^}}fsub_fpext_fmul_f16_to_f32: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0]{{$}} -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fsub_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 { +; GFX11-LABEL: fsub_fpext_fmul_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %mul.ext = fpext half %mul to float @@ -263,17 +452,37 @@ ; fold (fsub x, (fpext (fmul y, z))) ; -> (fma (fneg (fpext y)), (fpext z), x) - -; GCN-LABEL: {{^}}fsub_fpext_fmul_f16_to_f32_commute: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0] -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 -; GFX9-F32DENORM-NEXT: v_sub_f32_e32 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fsub_fpext_fmul_f16_to_f32_commute(float %x, half %y, half %z) #0 { +; GFX11-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32_commute: +; GFX11-F32FLUSH: ; %bb.0: ; %entry +; GFX11-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-F32FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-F32FLUSH-NEXT: v_fma_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0] +; GFX11-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32_commute: +; GFX11-F32DENORM: ; %bb.0: ; %entry +; GFX11-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-F32DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-F32DENORM-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32_commute: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32_commute: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul contract half %y, %z %mul.ext = fpext half %mul to float @@ -283,17 +492,30 @@ ; fold (fsub (fpext (fneg (fmul, x, y))), z) ; -> (fneg (fma (fpext x), (fpext y), z)) - -; GCN-LABEL: {{^}}fsub_fpext_fneg_fmul_f16_to_f32: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]{{$}} -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16_e64 v0, v0, -v1 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fsub_fpext_fneg_fmul_f16_to_f32(half %x, half %y, float %z) #0 { +; GFX11-LABEL: fsub_fpext_fneg_fmul_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fsub_fpext_fneg_fmul_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fsub_fpext_fneg_fmul_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e64 v0, v0, -v1 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %neg.mul = fsub half -0.0, %mul @@ -304,17 +526,30 @@ ; fold (fsub (fneg (fpext (fmul, x, y))), z) ; -> (fneg (fma (fpext x)), (fpext y), z) - -; GCN-LABEL: {{^}}fsub_fneg_fpext_fmul_f16_to_f32: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]{{$}} -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16_e64 v0, v0, -v1 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fsub_fneg_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 { +; GFX11-LABEL: fsub_fneg_fpext_fmul_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fsub_fneg_fpext_fmul_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fsub_fneg_fpext_fmul_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e64 v0, v0, -v1 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %x, %y %mul.ext = fpext half %mul to float @@ -325,19 +560,33 @@ ; fold (fsub (fmad x, y, (fpext (fmul u, v))), z) ; -> (fmad x, y (fmad (fpext u), (fpext v), (fneg z))) -; GCN-LABEL: {{^}}fsub_muladd_fpext_mul_f16_to_f32: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v3, v4, -v2 op_sel_hi:[1,1,0]{{$}} -; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 -; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v3 -; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fsub_muladd_fpext_mul_f16_to_f32(float %x, float %y, float %z, half %u, half %v) #0 { +; GFX11-LABEL: fsub_muladd_fpext_mul_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1] +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fsub_muladd_fpext_mul_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v3, v4, -v2 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fsub_muladd_fpext_mul_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v3 +; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul reassoc half %u, %v %mul.ext = fpext half %mul to float @@ -349,14 +598,27 @@ ; fold (fsub (fpext (fmad x, y, (fmul u, v))), z) ; -> (fmad (fpext x), (fpext y), ; (fmad (fpext u), (fpext v), (fneg z))) - -; GCN-LABEL: {{^}}fsub_fpext_muladd_mul_f16_to_f32: -; GFX9: v_mul_f16 -; GFX9: v_fma_f16 -; GFX9: v_cvt_f32_f16 -; GFX9: v_sub_f32 -; GCN: s_setpc_b64 define float @fsub_fpext_muladd_mul_f16_to_f32(half %x, half %y, float %z, half %u, half %v) #0 { +; GFX11-LABEL: fsub_fpext_muladd_mul_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f16_e32 v3, v0, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: fsub_fpext_muladd_mul_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX89-NEXT: v_fma_f16 v0, v0, v1, v3 +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %u, %v %fma = call half @llvm.fmuladd.f16(half %x, half %y, half %mul) @@ -367,18 +629,32 @@ ; fold (fsub x, (fmad y, z, (fpext (fmul u, v)))) ; -> (fmad (fneg y), z, (fmad (fneg (fpext u)), (fpext v), x)) -; GCN-LABEL: {{^}}fsub_muladd_fpext_mul_f16_to_f32_commute: -; GCN: s_waitcnt -; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v3, v4, v0 op_sel_hi:[1,1,0]{{$}} -; GFX9-F32FLUSH-NEXT: v_mad_f32 v0, -v1, v2, v0{{$}} -; GFX9-F32FLUSH-NEXT: s_setpc_b64 - -; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX9-F32DENORM-NEXT: v_fma_f32 v1, v1, v2, v3 -; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-F32DENORM-NEXT: s_setpc_b64 define float @fsub_muladd_fpext_mul_f16_to_f32_commute(float %x, float %y, float %z, half %u, half %v) #0 { +; GFX11-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1] +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32FLUSH-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v3, v4, v0 op_sel_hi:[1,1,0] +; GFX9-F32FLUSH-NEXT: v_mad_f32 v0, -v1, v2, v0 +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX9-F32DENORM-NEXT: v_fma_f32 v1, v1, v2, v3 +; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul reassoc half %u, %v %mul.ext = fpext half %mul to float @@ -390,14 +666,27 @@ ; fold (fsub x, (fpext (fma y, z, (fmul u, v)))) ; -> (fma (fneg (fpext y)), (fpext z), ; (fma (fneg (fpext u)), (fpext v), x)) -; GCN-LABEL: {{^}}fsub_fpext_muladd_mul_f16_to_f32_commute: -; GCN: s_waitcnt -; GFX9-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX9-NEXT: v_fma_f16 v1, v1, v2, v3 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 define float @fsub_fpext_muladd_mul_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 { +; GFX11-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX89-NEXT: v_fma_f16 v1, v1, v2, v3 +; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX89-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX89-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %u, %v %fma = call half @llvm.fmuladd.f16(half %y, half %z, half %mul) diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,13 +1,64 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GCN,GFX89 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GCN,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11 %s -; GCN-LABEL: {{^}}fpext_f16_to_f32 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]] -; GCN: buffer_store_dword v[[R_F32]] -; GCN: s_endpgm define amdgpu_kernel void @fpext_f16_to_f32( +; SI-LABEL: fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -17,13 +68,65 @@ ret void } -; GCN-LABEL: {{^}}fpext_f16_to_f64 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; GCN: v_cvt_f64_f32_e32 v[[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]], v[[A_F32]] -; GCN: buffer_store_dwordx2 v[[[R_F64_0]]:[[R_F64_1]]] -; GCN: s_endpgm define amdgpu_kernel void @fpext_f16_to_f64( +; SI-LABEL: fpext_f16_to_f64: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fpext_f16_to_f64: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fpext_f16_to_f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -33,16 +136,67 @@ ret void } -; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32 -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] -; GFX89: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GCN: buffer_store_dwordx2 v[[[R_F32_0]]:[[R_F32_1]]] -; GCN: s_endpgm - define amdgpu_kernel void @fpext_v2f16_to_v2f32( +; SI-LABEL: fpext_v2f16_to_v2f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fpext_v2f16_to_v2f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX89-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fpext_v2f16_to_v2f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -52,19 +206,74 @@ ret void } -; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64 -; GCN: buffer_load_dword -; SI-DAG: v_lshrrev_b32_e32 -; SI-DAG: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GFX89: v_cvt_f32_f16_sdwa - -; GCN: v_cvt_f64_f32_e32 -; GCN: v_cvt_f64_f32_e32 -; GCN: buffer_store_dwordx4 -; GCN: s_endpgm - define amdgpu_kernel void @fpext_v2f16_to_v2f64( +; SI-LABEL: fpext_v2f16_to_v2f64: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fpext_v2f16_to_v2f64: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX89-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX89-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fpext_v2f16_to_v2f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -74,9 +283,52 @@ ret void } -; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32: -; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) { +; SI-LABEL: s_fneg_fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_fneg_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fneg_fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %a.trunc = trunc i32 %a to i16 %a.val = bitcast i16 %a.trunc to half @@ -85,10 +337,61 @@ ret void } -; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]] define amdgpu_kernel void @fneg_fpext_f16_to_f32( +; SI-LABEL: fneg_fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fneg_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -99,10 +402,61 @@ ret void } -; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]| define amdgpu_kernel void @fabs_fpext_f16_to_f32( +; SI-LABEL: fabs_fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fabs_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fabs_fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -113,10 +467,61 @@ ret void } -; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]| define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( +; SI-LABEL: fneg_fabs_fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -128,17 +533,75 @@ ret void } -; GCN-LABEL: {{^}}fneg_multi_use_fpext_f16_to_f32: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; GCN-DAG: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[A]] - ; FIXME: Using the source modifier here only wastes code size -; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] -; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] -; GCN: store_dword [[CVT]] -; GCN: store_short [[XOR]] define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( +; SI-LABEL: fneg_multi_use_fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -150,19 +613,75 @@ ret void } -; GCN-LABEL: {{^}}fneg_multi_foldable_use_fpext_f16_to_f32: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]] -; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA_NEG]], [[CVTA]] -; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] - -; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]] -; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]] - -; GCN: buffer_store_dword [[CVTA_NEG]] -; GCN: buffer_store_short [[MUL]] define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( +; SI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v1, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX89-NEXT: v_mul_f16_e64 v0, -v0, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v0 +; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -175,16 +694,73 @@ ret void } -; GCN-LABEL: {{^}}fabs_multi_use_fpext_f16_to_f32: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; GCN-DAG: v_and_b32_e32 [[XOR:v[0-9]+]], 0x7fff, [[A]] - -; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] -; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |[[A]]| - -; GCN: store_dword [[CVT]] -; GCN: store_short [[XOR]] define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( +; SI-LABEL: fabs_multi_use_fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX89-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -196,19 +772,75 @@ ret void } -; GCN-LABEL: {{^}}fabs_multi_foldable_use_fpext_f16_to_f32: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]] -; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], |[[CVTA]]|, [[CVTA]] -; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] -; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[CVTA]] - -; GFX89-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]| -; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]] - -; GCN: buffer_store_dword [[ABS_A]] -; GCN: buffer_store_short [[MUL]] define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( +; SI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e64 v1, |v0|, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX89-NEXT: v_mul_f16_e64 v0, |v0|, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, v0 +; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -221,16 +853,73 @@ ret void } -; GCN-LABEL: {{^}}fabs_fneg_multi_use_fpext_f16_to_f32: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], 0x8000, [[A]] - -; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[OR]] -; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[OR]]| - -; GCN: buffer_store_dword [[CVT]] -; GCN: buffer_store_short [[OR]] define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( +; SI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX89-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -243,19 +932,75 @@ ret void } -; GCN-LABEL: {{^}}fabs_fneg_multi_foldable_use_fpext_f16_to_f32: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]] -; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], -|[[CVTA]]|, [[CVTA]] -; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] -; SI: v_or_b32_e32 [[FABS_FNEG:v[0-9]+]], 0x80000000, [[CVTA]] - -; GFX89-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]| -; GFX89-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]] - -; GCN: buffer_store_dword [[FABS_FNEG]] -; GCN: buffer_store_short [[MUL]] define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( +; SI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e64 v1, -|v0|, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX89-NEXT: v_mul_f16_e64 v0, -|v0|, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX11-NEXT: v_mul_f16_e64 v0, -|v0|, v0 +; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: