diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s ; DAGCombiner will transform: ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF)) @@ -42,6 +43,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fabs_free_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %bc= bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) store half %fabs, ptr 
addrspace(1) %out @@ -83,6 +97,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %in) store half %fabs, ptr addrspace(1) %out ret void @@ -123,6 +150,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fabs_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) store <2 x half> %fabs, ptr addrspace(1) %out ret void @@ -166,6 +206,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_fabs_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) store <4 x half> %fabs, ptr addrspace(1) %out ret void @@ -212,6 +265,20 @@ ; GFX9-NEXT: v_mul_f16_e64 v1, |s2|, v1 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fabs_fold_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %in0) %fmul = fmul half %fabs, %in1 store half %fmul, ptr addrspace(1) %out @@ -257,6 +324,18 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fabs_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid @@ -301,6 +380,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fabs_free_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to <2 x half> %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc) store <2 x half> %fabs, ptr addrspace(1) %out @@ -366,6 +458,21 @@ ; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fabs_fold_self_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid %val = load <2 x half>, ptr addrspace(1) %gep @@ -438,6 +545,21 @@ ; GFX9-NEXT: v_pk_mul_f16 v0, v0, s6 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fabs_fold_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 
@llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid %val = load <2 x half>, ptr addrspace(1) %gep @@ -506,6 +628,24 @@ ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_extract_fabs_fold_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, 4.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e64 v1, |v1|, 2.0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid %val = load <2 x half>, ptr addrspace(1) %gep.in @@ -570,6 +710,21 @@ ; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_extract_fabs_no_fold_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid %val = load <2 x 
half>, ptr addrspace(1) %gep.in @@ -589,6 +744,3 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GCN: {{.*}} -; GFX89: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -1,17 +1,82 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11 %s -; GCN-LABEL: {{^}}fadd_f16 -; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fadd_f16( +; SI-LABEL: fadd_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: 
s_mov_b32 s2, s10 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fadd_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_e32 v0, v0, v1 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fadd_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: buffer_load_u16 v0, off, 
s[4:7], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -23,15 +88,63 @@ ret void } -; GCN-LABEL: {{^}}fadd_f16_imm_a -; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fadd_f16_imm_a( +; SI-LABEL: fadd_f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fadd_f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s2 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; 
VI-NEXT: s_endpgm +; +; GFX11-LABEL: fadd_f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s6 +; GFX11-NEXT: s_mov_b32 s3, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: @@ -41,15 +154,63 @@ ret void } -; GCN-LABEL: {{^}}fadd_f16_imm_b -; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fadd_f16_imm_b( +; SI-LABEL: fadd_f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fadd_f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt 
lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s2 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_e32 v0, 2.0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fadd_f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s6 +; GFX11-NEXT: s_mov_b32 s3, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -59,33 +220,85 @@ ret void } -; GCN-LABEL: {{^}}fadd_v2f16: -; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; VI: flat_load_dword v[[A_V2_F16:[0-9]+]] -; VI: flat_load_dword v[[B_V2_F16:[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] 
-; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] -; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fadd_v2f16( +; SI-LABEL: fadd_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fadd_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; 
VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_f16_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fadd_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v0, v0, s[8:9] +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, v1, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -100,26 +313,70 @@ ret void } -; GCN-LABEL: {{^}}fadd_v2f16_imm_a: -; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] -; 
SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000 -; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fadd_v2f16_imm_a( +; SI-LABEL: fadd_v2f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fadd_v2f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 
0x4000 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fadd_v2f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: @@ -131,26 +388,70 @@ ret void } -; GCN-LABEL: {{^}}fadd_v2f16_imm_b: -; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 -; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], 
v[[R_F16_1]], v[[R_F16_0]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fadd_v2f16_imm_b( +; SI-LABEL: fadd_v2f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fadd_v2f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 2.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fadd_v2f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, 0x3c004000, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s declare half @llvm.fabs.f16(half) #0 declare half @llvm.canonicalize.f16(half) #0 @@ -45,6 +46,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_undef_value_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half undef) store half %canonicalized, ptr addrspace(1) %out ret void @@ -87,6 +97,18 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_var_f16: +; GFX11: ; 
%bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, ptr addrspace(1) undef @@ -127,6 +149,18 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_canonicalize_var_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, s2, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = bitcast i16 %val.arg to half %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, ptr addrspace(1) %out @@ -158,6 +192,15 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ins0 = insertelement <2 x half> undef, half %lo, i32 0 %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1) @@ -201,6 +244,18 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; 
GFX11-LABEL: v_test_canonicalize_fabs_var_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs) @@ -245,6 +300,18 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) %val.fabs.fneg = fneg half %val.fabs @@ -290,6 +357,18 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fneg_var_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fneg = fneg half %val %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) @@ -334,6 +413,18 @@ ; 
CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fneg = fneg half %val %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) @@ -378,6 +469,18 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) %val.fabs.fneg = fneg half %val.fabs @@ -414,6 +517,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_p0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0.0) store half %canonicalized, ptr addrspace(1) %out ret void @@ -448,6 +560,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_n0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half -0.0) store half %canonicalized, ptr addrspace(1) %out ret void @@ -482,6 +603,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_p1_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 1.0) store half %canonicalized, ptr addrspace(1) %out ret void @@ -516,6 +646,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_n1_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half -1.0) store half %canonicalized, ptr addrspace(1) %out ret void @@ -550,6 +689,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_literal_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 
0x4c00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 16.0) store half %canonicalized, ptr addrspace(1) %out ret void @@ -584,6 +732,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -618,6 +775,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -652,6 +818,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized 
= call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -686,6 +861,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -720,6 +904,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00) store half %canonicalized, ptr addrspace(1) %out ret void @@ -754,6 +947,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) store half %canonicalized, ptr addrspace(1) %out ret void @@ -788,6 +990,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, 
off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) store half %canonicalized, ptr addrspace(1) %out ret void @@ -822,6 +1033,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) store half %canonicalized, ptr addrspace(1) %out ret void @@ -856,6 +1076,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -890,6 +1119,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 
:: v_dual_mov_b32 v1, 0x7e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -924,6 +1162,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) store half %canonicalized, ptr addrspace(1) %out ret void @@ -984,6 +1231,18 @@ ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_var_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid %val = load <2 x half>, ptr addrspace(1) %gep @@ -1048,6 +1307,20 @@ ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid %val = load <2 x half>, ptr addrspace(1) %gep @@ -1114,6 +1387,20 @@ ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid %val = load <2 x half>, ptr addrspace(1) %gep @@ -1180,6 +1467,18 @@ ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = 
call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid %val = load <2 x half>, ptr addrspace(1) %gep @@ -1233,6 +1532,18 @@ ; CI-NEXT: v_or_b32_e32 v0, v1, v0 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_canonicalize_var_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = bitcast i32 %val.arg to <2 x half> %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) store <2 x half> %canonicalized, ptr addrspace(1) %out @@ -1267,6 +1578,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_p0_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1301,6 +1621,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_n0_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) 
store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1335,6 +1664,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_p1_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1369,6 +1707,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_n1_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1403,6 +1750,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_literal_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1437,6 +1793,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: 
s_endpgm +; +; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1471,6 +1836,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1505,6 +1879,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1539,6 +1922,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1573,6 +1965,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1607,6 +2008,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>)) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1641,6 +2051,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1675,6 +2094,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1709,6 +2137,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1743,6 +2180,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> 
@llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1777,6 +2223,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1809,6 +2264,14 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_var_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val) ret <3 x half> %canonicalized } @@ -1844,6 +2307,14 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_var_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %val) ret <4 x half> %canonicalized } @@ -1876,6 +2347,15 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_canonicalize_undef_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) store <2 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -1903,6 +2383,15 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <2 x half> undef, half %val, i32 0 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized @@ -1929,6 +2418,15 @@ ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v0 ; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_undef_reg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <2 x half> undef, half %val, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized @@ -1953,6 +2451,13 @@ ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 1.0 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <2 x half> undef, half 1.0, i32 1 
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized @@ -1977,6 +2482,13 @@ ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <2 x half> undef, half 1.0, i32 0 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized @@ -2001,6 +2513,13 @@ ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0x41800000 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfrev_b32_e32 v0, 50 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <2 x half> undef, half 16.0, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized @@ -2025,6 +2544,13 @@ ; CI-NEXT: v_mov_b32_e32 v0, 0x41800000 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x4c00 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <2 x half> undef, half 16.0, i32 0 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized @@ -2052,6 +2578,15 @@ ; CI-NEXT: v_mov_b32_e32 v1, 2.0 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_reg_k_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, 2.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <2 x half> undef, half %val, i32 0 %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) @@ -2080,6 +2615,15 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; CI-NEXT: v_mov_b32_e32 v0, 2.0 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_k_reg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, 2.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <2 x half> undef, half 2.0, i32 0 %vec1 = insertelement <2 x half> %vec0, half %val, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) @@ -2117,6 +2661,17 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_canonicalize_undef_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) store <4 x half> %canonicalized, ptr addrspace(1) %out ret void @@ -2149,6 +2704,16 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <4 x half> undef, half %val, i32 0 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec) ret <4 x half> %canonicalized @@ -2185,6 +2750,16 @@ ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <4 x half> undef, half %val0, i32 0 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 1 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1) @@ -2226,6 +2801,17 @@ ; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <4 x half> undef, half %val0, i32 0 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2 %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3 @@ -2272,6 +2858,15 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
v_test_canonicalize_var_v6f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canonicalized = call <6 x half> @llvm.canonicalize.v6f16(<6 x half> %val) ret <6 x half> %canonicalized } @@ -2323,6 +2918,16 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_var_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %val) ret <8 x half> %canonicalized } @@ -2390,6 +2995,18 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_var_v12f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canonicalized = call <12 x half> @llvm.canonicalize.v12f16(<12 x half> %val) ret <12 x half> %canonicalized } @@ -2473,6 +3090,20 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_var_v16f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_max_f16 
v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canonicalized = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %val) ret <16 x half> %canonicalized } @@ -2622,6 +3253,28 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_var_v32f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX11-NEXT: v_pk_max_f16 v8, v8, v8 +; GFX11-NEXT: v_pk_max_f16 v9, v9, v9 +; GFX11-NEXT: v_pk_max_f16 v10, v10, v10 +; GFX11-NEXT: v_pk_max_f16 v11, v11, v11 +; GFX11-NEXT: v_pk_max_f16 v12, v12, v12 +; GFX11-NEXT: v_pk_max_f16 v13, v13, v13 +; GFX11-NEXT: v_pk_max_f16 v14, v14, v14 +; GFX11-NEXT: v_pk_max_f16 v15, v15, v15 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canonicalized = call <32 x half> @llvm.canonicalize.v32f16(<32 x half> %val) ret <32 x half> %canonicalized } @@ -3030,6 +3683,46 @@ ; CI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_var_v64f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-NEXT: 
v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX11-NEXT: v_pk_max_f16 v8, v8, v8 +; GFX11-NEXT: v_pk_max_f16 v9, v9, v9 +; GFX11-NEXT: v_pk_max_f16 v10, v10, v10 +; GFX11-NEXT: v_pk_max_f16 v11, v11, v11 +; GFX11-NEXT: v_pk_max_f16 v12, v12, v12 +; GFX11-NEXT: v_pk_max_f16 v13, v13, v13 +; GFX11-NEXT: v_pk_max_f16 v14, v14, v14 +; GFX11-NEXT: v_pk_max_f16 v15, v15, v15 +; GFX11-NEXT: v_pk_max_f16 v16, v16, v16 +; GFX11-NEXT: v_pk_max_f16 v17, v17, v17 +; GFX11-NEXT: v_pk_max_f16 v18, v18, v18 +; GFX11-NEXT: v_pk_max_f16 v19, v19, v19 +; GFX11-NEXT: v_pk_max_f16 v20, v20, v20 +; GFX11-NEXT: v_pk_max_f16 v21, v21, v21 +; GFX11-NEXT: v_pk_max_f16 v22, v22, v22 +; GFX11-NEXT: v_pk_max_f16 v23, v23, v23 +; GFX11-NEXT: v_pk_max_f16 v24, v24, v24 +; GFX11-NEXT: v_pk_max_f16 v25, v25, v25 +; GFX11-NEXT: v_pk_max_f16 v26, v26, v26 +; GFX11-NEXT: v_pk_max_f16 v27, v27, v27 +; GFX11-NEXT: v_pk_max_f16 v28, v28, v28 +; GFX11-NEXT: v_pk_max_f16 v29, v29, v29 +; GFX11-NEXT: v_pk_max_f16 v30, v30, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v31, v31, v31 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canonicalized = call <64 x half> @llvm.canonicalize.v64f16(<64 x half> %val) ret <64 x half> %canonicalized } diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -1,6 +1,8 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX678 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX678 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX6 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX8 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s declare float @llvm.fabs.f32(float) #0 declare float @llvm.canonicalize.f32(float) #0 @@ -17,32 +19,131 @@ declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 -; GCN-LABEL: {{^}}v_test_canonicalize_var_f32: -; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: v_test_canonicalize_var_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_load_dword v2, v[0:1] +; GFX678-NEXT: s_waitcnt vmcnt(0) +; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_canonicalize_var_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_var_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}s_test_canonicalize_var_f32: -; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} -; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, float %val) #1 { +; GFX6-LABEL: s_test_canonicalize_var_f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: flat_store_dword v[0:1], v2 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_test_canonicalize_var_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_test_canonicalize_var_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
v_max_f32_e64 v1, s2, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_canonicalize_var_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, s2, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32: -; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| -; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}| -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_load_dword v2, v[0:1] +; GFX678-NEXT: s_waitcnt vmcnt(0) +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, |v2| +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_canonicalize_fabs_var_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fabs_var_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, 
s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out %val.fabs = call float @llvm.fabs.f32(float %val) %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs) @@ -50,11 +151,41 @@ ret void } -; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32: -; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], -1.0, |{{v[0-9]+}}| -; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}| -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_load_dword v2, v[0:1] +; GFX678-NEXT: s_waitcnt vmcnt(0) +; GFX678-NEXT: v_mul_f32_e64 v2, -1.0, |v2| +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm 
%val = load float, ptr addrspace(1) %out %val.fabs = call float @llvm.fabs.f32(float %val) %val.fabs.fneg = fneg float %val.fabs @@ -63,11 +194,41 @@ ret void } -; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32: -; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], -1.0, {{v[0-9]+}} -; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: v_test_canonicalize_fneg_var_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_load_dword v2, v[0:1] +; GFX678-NEXT: s_waitcnt vmcnt(0) +; GFX678-NEXT: v_mul_f32_e32 v2, -1.0, v2 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_canonicalize_fneg_var_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fneg_var_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out %val.fneg = fneg float %val %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg) @@ -75,182 +236,686 @@ ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_undef_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: 
{{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_undef_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_undef_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_undef_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float undef) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_p0_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_p0_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; 
GFX11-LABEL: test_fold_canonicalize_p0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 0.0) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32: -; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_n0_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_n0_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_n0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float -0.0) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) #1 { +; 
GFX678-LABEL: test_fold_canonicalize_p1_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_p1_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_p1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 1.0) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_n1_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_n1_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, -1.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: 
test_fold_canonicalize_n1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float -1.0) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_literal_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_literal_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x41800000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_literal_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 16.0) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define 
amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 { +; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32: -; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm 
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #3 { +; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x807fffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_qnan_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_qnan_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: 
test_fold_canonicalize_qnan_value_neg1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float)) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float)) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32: -; GCN: 
v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_snan0_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float)) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: 
test_fold_canonicalize_snan1_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float)) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_snan2_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float)) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_store_dword v[0:1], v2 +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_snan3_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float)) store float %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_test_canonicalize_var_f64: -; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: v_test_canonicalize_var_f64: +; GFX678: ; %bb.0: 
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_waitcnt vmcnt(0) +; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_canonicalize_var_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_var_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}s_test_canonicalize_var_f64: -; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, double %val) #1 { +; GFX6-LABEL: s_test_canonicalize_var_f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX6-NEXT: 
s_endpgm +; +; GFX8-LABEL: s_test_canonicalize_var_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: s_test_canonicalize_var_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_canonicalize_var_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64: -; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], |{{v\[[0-9]+:[0-9]+\]}}|, |{{v\[[0-9]+:[0-9]+\]}}| -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_waitcnt vmcnt(0) +; GFX678-NEXT: v_max_f64 v[2:3], |v[2:3]|, |v[2:3]| +; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_canonicalize_fabs_var_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], |v[0:1]|, |v[0:1]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fabs_var_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], |v[0:1]|, |v[0:1]| +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out %val.fabs = call double @llvm.fabs.f64(double %val) %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs) @@ -258,10 +923,41 @@ ret void } -; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64: -; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]\]]], -|{{v\[[0-9]+:[0-9]+\]}}|, -|{{v\[[0-9]+:[0-9]+\]}}| -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_waitcnt vmcnt(0) +; GFX678-NEXT: v_max_f64 v[2:3], -|v[2:3]|, -|v[2:3]| +; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], 
-|v[0:1]|, -|v[0:1]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out %val.fabs = call double @llvm.fabs.f64(double %val) %val.fabs.fneg = fneg double %val.fabs @@ -270,10 +966,41 @@ ret void } -; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64: -; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -{{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mov_b32_e32 v1, s1 +; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_waitcnt vmcnt(0) +; GFX678-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_canonicalize_fneg_var_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_fneg_var_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out %val.fneg = fneg double %val %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg) @@ -281,170 +1008,617 @@ ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_p0_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_p0_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_p0_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 0.0) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: 
{{^}}test_fold_canonicalize_n0_f64: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_n0_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_n0_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_n0_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double -0.0) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f64: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_p1_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: 
v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_p1_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_p1_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 1.0) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f64: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_n1_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_n1_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xbff00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_n1_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double -1.0) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f64: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_literal_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_literal_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40300000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_literal_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 16.0) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f64: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 
0{{$}} -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #2 { +; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f64: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 { +; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX678-NEXT: v_mov_b32_e32 v0, -1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, -1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xfffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff +; GFX11-NEXT: v_mov_b32_e32 v0, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f64: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 { +; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f64: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 { +; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, -1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, -1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x800fffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff +; GFX11-NEXT: v_mov_b32_e32 v0, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f64: -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_qnan_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_qnan_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f64: -; 
GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double)) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f64: -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double)) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f64: -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_snan0_value_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double)) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f64: -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_snan1_value_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm 
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double)) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f64: -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_snan2_value_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double)) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f64: -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void 
@test_fold_canonicalize_snan3_value_f64(ptr addrspace(1) %out) #1 { +; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX678-NEXT: s_waitcnt lgkmcnt(0) +; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v2, s0 +; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX678-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_snan3_value_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double)) store double %canonicalized, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_canonicalize_value_f64_flush: -; GFX678: v_mul_f64 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}] -; GCN9: v_max_f64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { +; GFX6-LABEL: test_canonicalize_value_f64_flush: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; 
GFX6-NEXT: v_add_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mul_f64 v[0:1], 1.0, v[0:1] +; GFX6-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_canonicalize_value_f64_flush: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f64 v[0:1], 1.0, v[0:1] +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: test_canonicalize_value_f64_flush: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_canonicalize_value_f64_flush: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id %v = load double, ptr addrspace(1) %gep, align 8 @@ -454,10 +1628,63 @@ ret void } -; GCN-LABEL: 
{{^}}test_canonicalize_value_f32_flush: -; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} -; GFX9: v_max_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { +; GFX6-LABEL: test_canonicalize_value_f32_flush: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_load_dword v0, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_store_dword v[0:1], v3 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_canonicalize_value_f32_flush: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: test_canonicalize_value_f32_flush: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_canonicalize_value_f32_flush: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %v = load float, ptr addrspace(1) %gep, align 4 @@ -467,10 +1694,65 @@ ret void } -; GCN-LABEL: {{^}}test_canonicalize_value_f16_flush: -; GFX8: v_mul_f16_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} -; GFX9: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { +; GFX6-LABEL: test_canonicalize_value_f16_flush: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_load_ushort v0, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_store_short v[0:1], v3 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_canonicalize_value_f16_flush: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; 
GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_store_short v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: test_canonicalize_value_f16_flush: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_canonicalize_value_f16_flush: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id %v = load half, ptr addrspace(1) %gep, align 2 @@ -480,13 +1762,75 @@ ret void } -; GCN-LABEL: {{^}}test_canonicalize_value_v2f16_flush: -; GFX8: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00 -; GFX8-DAG: v_mul_f16_sdwa v{{[0-9]+}}, [[ONE]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-DAG: v_mul_f16_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GFX9: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { +; GFX6-LABEL: test_canonicalize_value_v2f16_flush: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc +; GFX6-NEXT: flat_load_dword v0, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX6-NEXT: flat_store_dword v[0:1], v4 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_canonicalize_value_v2f16_flush: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: v_or_b32_e32 v4, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: test_canonicalize_value_v2f16_flush: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_canonicalize_value_v2f16_flush: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id %v = load <2 x half>, ptr addrspace(1) %gep, align 4 @@ -496,9 +1840,63 @@ ret void } -; GCN-LABEL: {{^}}test_canonicalize_value_f64_denorm: -; GCN: v_max_f64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { +; GFX6-LABEL: test_canonicalize_value_f64_denorm: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX6-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_canonicalize_value_f64_denorm: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: test_canonicalize_value_f64_denorm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_canonicalize_value_f64_denorm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id %v = load double, ptr addrspace(1) %gep, align 8 @@ -508,10 +1906,63 @@ ret void } -; GCN-LABEL: {{^}}test_canonicalize_value_f32_denorm: -; GFX678: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} -; GFX9: v_max_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { +; GFX6-LABEL: test_canonicalize_value_f32_denorm: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_load_dword v0, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: 
flat_store_dword v[0:1], v3 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_canonicalize_value_f32_denorm: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: test_canonicalize_value_f32_denorm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_canonicalize_value_f32_denorm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %v = load float, ptr addrspace(1) %gep, align 4 @@ -522,11 +1973,65 @@ } ; FIXME: Conversion to float should count as the canonicalize pre-gfx8 -; GCN-LABEL: {{^}}test_canonicalize_value_f16_denorm: -; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} -; GFX8: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, 
{{v[0-9]+}} define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { +; GFX6-LABEL: test_canonicalize_value_f16_denorm: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_load_ushort v0, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_store_short v[0:1], v3 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_canonicalize_value_f16_denorm: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_store_short v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: test_canonicalize_value_f16_denorm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_canonicalize_value_f16_denorm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id %v = load half, ptr addrspace(1) %gep, align 2 @@ -536,15 +2041,75 @@ ret void } -; GCN-LABEL: {{^}}test_canonicalize_value_v2f16_denorm: -; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} -; GFX8: v_max_f16_sdwa -; GFX8: v_max_f16_e32 -; GFX9: v_pk_max_f16 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { +; GFX6-LABEL: test_canonicalize_value_v2f16_denorm: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_load_dword v0, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX6-NEXT: flat_store_dword v[0:1], v4 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: test_canonicalize_value_v2f16_denorm: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 
2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: test_canonicalize_value_v2f16_denorm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_canonicalize_value_v2f16_denorm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id %v = load <2 x half>, ptr addrspace(1) %gep, align 4 @@ -554,10 +2119,67 @@ ret void } -; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f64: -; GCN: v_max_f64 -; GCN: v_max_f64 define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) #1 { +; GFX6-LABEL: v_test_canonicalize_var_v2f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v5, s1 +; GFX6-NEXT: v_mov_b32_e32 v4, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX6-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: v_test_canonicalize_var_v2f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_canonicalize_var_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_canonicalize_var_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 
v[0:1] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x double>, ptr addrspace(1) %out, i32 %tid %val = load <2 x double>, ptr addrspace(1) %gep @@ -566,91 +2188,216 @@ ret void } -; GCN-LABEL: {{^}}v_test_canonicalize_v2f32_flush: -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 { +; GFX678-LABEL: v_test_canonicalize_v2f32_flush: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_v2f32_flush: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_v2f32_flush: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canon = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %arg) ret <2 x float> %canon } -; GCN-LABEL: {{^}}v_test_canonicalize_v3f32_flush: -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} define <3 x float> 
@v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 { +; GFX678-LABEL: v_test_canonicalize_v3f32_flush: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_v3f32_flush: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_v3f32_flush: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canon = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> %arg) ret <3 x float> %canon } -; GCN-LABEL: {{^}}v_test_canonicalize_v4f32_flush: -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 { +; GFX678-LABEL: v_test_canonicalize_v4f32_flush: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX678-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
v_test_canonicalize_v4f32_flush: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_v4f32_flush: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canon = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %arg) ret <4 x float> %canon } -; GCN-LABEL: {{^}}v_test_canonicalize_v8f32_flush: -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} - -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} + define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 { +; GFX678-LABEL: v_test_canonicalize_v8f32_flush: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: v_mul_f32_e32 v0, 
1.0, v0 +; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX678-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX678-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX678-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX678-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX678-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_v8f32_flush: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_v8f32_flush: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5 +; GFX11-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %canon = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %arg) ret <8 x float> %canon } -; GCN-LABEL: {{^}}v_test_canonicalize_v2f64: -; GCN: v_max_f64 -; GCN: v_max_f64 define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 { +; GFX678-LABEL: v_test_canonicalize_v2f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], 
v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] %canon = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %arg) ret <2 x double> %canon } -; GCN-LABEL: {{^}}v_test_canonicalize_v3f64: -; GCN: v_max_f64 -; GCN: v_max_f64 -; GCN: v_max_f64 define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 { +; GFX678-LABEL: v_test_canonicalize_v3f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX678-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_v3f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX11-NEXT: s_setpc_b64 s[30:31] %canon = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> %arg) ret <3 x double> %canon } -; GCN-LABEL: {{^}}v_test_canonicalize_v4f64: -; GCN: v_max_f64 -; GCN: v_max_f64 -; GCN: v_max_f64 -; GCN: v_max_f64 define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 { +; GFX678-LABEL: v_test_canonicalize_v4f64: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; 
GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX678-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX678-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX678-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_canonicalize_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX11-NEXT: s_setpc_b64 s[30:31] %canon = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %arg) ret <4 x double> %canon } diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -1,17 +1,84 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=VI %s - -; GCN-LABEL: {{^}}fcmp_f16_lt -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: 
buffer_store_dword v[[R_I32]] -; GCN: s_endpgm +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11 %s + define amdgpu_kernel void @fcmp_f16_lt( +; SI-LABEL: fcmp_f16_lt: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_lt: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, 
s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_lt: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -24,20 +91,83 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_lt_abs: -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]| -; SI: v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]| - -; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]| - -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword 
v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_lt_abs( +; SI-LABEL: fcmp_f16_lt_abs: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_lt_abs: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_lt_f16_e64 s[4:5], |v0|, |v1| +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_lt_abs: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: 
s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_lt_f16_e64 s0, |v0|, |v1| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -52,17 +182,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_eq -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_eq_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_eq( +; SI-LABEL: fcmp_f16_eq: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_eq: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_eq: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -75,17 +270,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_le -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_le_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_le_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_le( +; SI-LABEL: fcmp_f16_le: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_le: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 
+; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_le_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_le: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -98,17 +358,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_gt -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_gt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_gt( +; SI-LABEL: fcmp_f16_gt: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_gt: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_gt: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -121,17 +446,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_lg -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_lg_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_lg( +; SI-LABEL: fcmp_f16_lg: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; 
SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_lg: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_lg: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -144,17 +534,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_ge -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: 
buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_ge_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_ge( +; SI-LABEL: fcmp_f16_ge: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_ge: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1 +; VI-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_ge: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -167,17 +622,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_o -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_o_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_o_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_o( +; SI-LABEL: fcmp_f16_o: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt 
lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_o: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_o: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -190,17 +710,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_u -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_u_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_u_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_u( +; SI-LABEL: fcmp_f16_u: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_u: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: 
s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_u_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_u: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -213,17 +798,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_nge -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_nge_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: 
v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_nge( +; SI-LABEL: fcmp_f16_nge: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_nge: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_nge: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -236,17 +886,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_nlg -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_nlg_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_nlg( +; SI-LABEL: fcmp_f16_nlg: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_nlg: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_nlg: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -259,17 +974,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_ngt -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_ngt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_ngt( +; SI-LABEL: fcmp_f16_ngt: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_ngt: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, 
off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_ngt: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -282,17 +1062,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_nle -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_nle( +; SI-LABEL: fcmp_f16_nle: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_nle: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_nle: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 
+; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -305,17 +1150,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_neq -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_neq_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_neq( +; SI-LABEL: fcmp_f16_neq: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_neq: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_neq: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -328,17 +1238,82 @@ ret void } -; GCN-LABEL: {{^}}fcmp_f16_nlt -; GCN: 
buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] -; GCN: buffer_store_dword v[[R_I32]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_f16_nlt( +; SI-LABEL: fcmp_f16_nlt: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_f16_nlt: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: 
v_cmp_nlt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_f16_nlt: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -351,13 +1326,97 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_lt: -; SI: v_cmp_lt_f32_e32 vcc, -; SI: v_cmp_lt_f32_e32 vcc, - -; VI: v_cmp_lt_f16_e32 vcc, -; VI: v_cmp_lt_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_lt( +; SI-LABEL: fcmp_v2f16_lt: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; 
SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_lt: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_lt: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: 
s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -370,13 +1429,98 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_eq -; SI: v_cmp_eq_f32_e32 vcc, -; SI: v_cmp_eq_f32_e32 vcc, -; VI: v_cmp_eq_f16_e32 vcc, -; VI: v_cmp_eq_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_eq( +; SI-LABEL: fcmp_v2f16_eq: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_eq_f32_e32 
vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_eq: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_eq_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_eq_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_eq: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -389,12 +1533,97 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_le: -; SI: v_cmp_le_f32_e32 vcc -; SI: v_cmp_le_f32_e32 vcc -; VI: v_cmp_le_f16_e32 vcc -; VI: v_cmp_le_f16_e32 vcc define amdgpu_kernel void @fcmp_v2f16_le( +; SI-LABEL: fcmp_v2f16_le: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_le_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_le_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_le: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 
+; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_le_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_le_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_le: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -407,13 +1636,97 @@ ret void } -; 
GCN-LABEL: {{^}}fcmp_v2f16_gt: -; SI: v_cmp_gt_f32_e32 vcc, -; SI: v_cmp_gt_f32_e32 vcc, - -; VI: v_cmp_gt_f16_e32 vcc, -; VI: v_cmp_gt_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_gt( +; SI-LABEL: fcmp_v2f16_gt: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_gt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_gt: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt 
vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_gt_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_gt: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -426,13 +1739,98 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_lg: -; SI: v_cmp_lg_f32_e32 vcc, -; SI: v_cmp_lg_f32_e32 vcc, -; VI: v_cmp_lg_f16_e32 vcc, -; VI: v_cmp_lg_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_lg( +; SI-LABEL: fcmp_v2f16_lg: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; 
SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_lg_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_lg_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_lg: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_lg_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_lg_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_lg: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -445,13 +1843,98 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_ge: -; SI: v_cmp_ge_f32_e32 vcc, -; SI: v_cmp_ge_f32_e32 vcc, -; VI: v_cmp_ge_f16_e32 vcc, -; VI: v_cmp_ge_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_ge( +; SI-LABEL: fcmp_v2f16_ge: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_ge_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_ge_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_ge: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_ge_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_ge_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_ge: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: 
buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -464,13 +1947,98 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_o: -; SI: v_cmp_o_f32_e32 vcc, -; SI: v_cmp_o_f32_e32 vcc, -; VI: v_cmp_o_f16_e32 vcc, -; VI: v_cmp_o_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_o( +; SI-LABEL: fcmp_v2f16_o: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_o_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 
v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_o: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_o_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_o: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -483,13 +2051,98 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_u: -; SI: v_cmp_u_f32_e32 vcc, -; SI: v_cmp_u_f32_e32 vcc, -; VI: v_cmp_u_f16_e32 vcc, -; VI: v_cmp_u_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_u( +; SI-LABEL: fcmp_v2f16_u: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_u_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_u_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_u: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 
s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_u_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_u_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_u: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -502,13 +2155,97 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_nge -; SI: v_cmp_nge_f32_e32 vcc, 
-; SI: v_cmp_nge_f32_e32 vcc, - -; VI: v_cmp_nge_f16_e32 vcc, -; VI: v_cmp_nge_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_nge( +; SI-LABEL: fcmp_v2f16_nge: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_nge: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; 
VI-NEXT: v_cmp_nge_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_nge_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_nge: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -521,13 +2258,97 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_nlg -; SI: v_cmp_nlg_f32_e32 vcc -; SI: v_cmp_nlg_f32_e32 vcc - -; VI: v_cmp_nlg_f16_e32 vcc -; VI: v_cmp_nlg_f16_e32 vcc define amdgpu_kernel void @fcmp_v2f16_nlg( +; SI-LABEL: fcmp_v2f16_nlg: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: 
s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_nlg: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_nlg: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -540,13 +2361,98 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_ngt -; SI: v_cmp_ngt_f32_e32 vcc, -; SI: v_cmp_ngt_f32_e32 vcc, -; VI: v_cmp_ngt_f16_e32 vcc, -; VI: v_cmp_ngt_f16_e32 vcc, define amdgpu_kernel void @fcmp_v2f16_ngt( +; SI-LABEL: fcmp_v2f16_ngt: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, 
v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_ngt: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_ngt: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, 
s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -559,13 +2465,97 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_nle -; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} - -; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fcmp_v2f16_nle( +; SI-LABEL: fcmp_v2f16_nle: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v2, v3 +; SI-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_nle: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_nle: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 
+; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -578,13 +2568,97 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_neq -; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} - -; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fcmp_v2f16_neq( +; SI-LABEL: fcmp_v2f16_neq: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_neq_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_neq: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_neq_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_neq_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_neq: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, 
s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -597,26 +2671,97 @@ ret void } -; GCN-LABEL: {{^}}fcmp_v2f16_nlt -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]] - -; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]] -; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[B_V2_F16]], v[[A_V2_F16]] -; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] - -; VI: v_cmp_nlt_f16_e32 vcc, v[[B_F16_1]], v[[A_F16_1]] -; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v[[[R_I32_0]]:[[R_I32_1]]] -; GCN: s_endpgm define amdgpu_kernel void @fcmp_v2f16_nlt( +; SI-LABEL: fcmp_v2f16_nlt: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fcmp_v2f16_nlt: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fcmp_v2f16_nlt: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 
v2, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefixes=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope --check-prefixes=VI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11 %s declare half @llvm.copysign.f16(half, half) #0 declare float @llvm.copysign.f32(float, float) #0 @@ -56,6 +57,21 @@ ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_copysign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %out = call half @llvm.copysign.f16(half %mag, half %sign) store half 
%out, ptr addrspace(1) %arg_out ret void @@ -96,6 +112,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half 0.0) store half %result, ptr addrspace(1) %out, align 4 ret void @@ -136,6 +165,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f16_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half 1.0) store half %result, ptr addrspace(1) %out, align 4 ret void @@ -176,6 +218,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f16_10.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; 
GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half 10.0) store half %result, ptr addrspace(1) %out, align 4 ret void @@ -216,6 +271,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f16_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half -1.0) store half %result, ptr addrspace(1) %out, align 4 ret void @@ -256,6 +324,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f16_neg10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s2, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half -10.0) store half %result, ptr addrspace(1) %out, align 4 ret void @@ -298,6 +379,18 @@ ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f16_0_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half 0.0, half %sign) store half %result, ptr addrspace(1) %out, align 4 ret void @@ -343,6 +436,20 @@ ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f16_1_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half 1.0, half %sign) store half %result, ptr addrspace(1) %out, align 4 ret void @@ -388,6 +495,20 @@ ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f16_10_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half 10.0, half %sign) store half %result, ptr addrspace(1) %out, align 4 ret void @@ -432,6 +553,20 @@ ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 ; GFX9-NEXT: 
global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f16_neg1_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half -1.0, half %sign) store half %result, ptr addrspace(1) %out, align 4 ret void @@ -477,6 +612,20 @@ ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f16_neg10_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half -10.0, half %sign) store half %result, ptr addrspace(1) %out, align 4 ret void @@ -505,6 +654,13 @@ ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_copysign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.copysign.f16(half %mag, half %sign) ret half %result } @@ -528,6 +684,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: 
v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.copysign.f16(half %mag, half 0.0) ret half %result } @@ -551,6 +714,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f16_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.copysign.f16(half %mag, half 1.0) ret half %result } @@ -574,6 +744,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f16_10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.copysign.f16(half %mag, half 10.0) ret half %result } @@ -597,6 +774,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f16_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.copysign.f16(half %mag, half -1.0) ret half %result } @@ -620,6 +804,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f16_neg10: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.copysign.f16(half %mag, half -10.0) ret half %result } @@ -693,6 +884,25 @@ ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v1, s[6:7] +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0 +; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid %mag = load half, ptr addrspace(1) %arg_mag_gep @@ -774,6 +984,25 @@ ; GFX9-NEXT: v_bfi_b32 v3, s0, v3, v1 ; GFX9-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v2, v1, s[6:7] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v1 +; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid %mag = load half, ptr addrspace(1) %arg_mag_gep @@ -853,6 +1082,25 @@ ; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v1, s[4:5] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid %mag = load float, ptr addrspace(1) %arg_mag_gep @@ -934,6 +1182,25 @@ ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v2, v1, 
s[4:5] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr double, ptr addrspace(1) %arg_mag, i32 %tid %mag = load double, ptr addrspace(1) %arg_mag_gep @@ -1015,6 +1282,25 @@ ; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GFX9-NEXT: global_store_short v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v1, s[4:5] +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid %mag = load half, ptr addrspace(1) %arg_mag_gep @@ -1093,6 +1379,25 @@ ; GFX9-NEXT: v_bfi_b32 v1, s0, v2, v1 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; 
GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid %mag = load half, ptr addrspace(1) %arg_mag @@ -1176,6 +1481,25 @@ ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_short v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] +; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid %mag = load float, ptr addrspace(1) %arg_mag_gep @@ -1372,6 +1696,75 @@ ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff +; GFX11-NEXT: s_lshr_b32 s2, s7, 8 +; GFX11-NEXT: s_or_b32 s1, s1, s6 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffe +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX11-NEXT: s_bfe_u32 s1, s7, 0xb0014 +; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s1 +; GFX11-NEXT: s_addk_i32 s1, 0xfc10 +; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 +; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: s_lshl_b32 s8, s1, 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_or_b32 s3, s2, 0x1000 +; GFX11-NEXT: s_or_b32 s8, s2, s8 +; GFX11-NEXT: s_lshr_b32 s6, s3, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: s_or_b32 s3, s6, s3 +; GFX11-NEXT: s_cmp_lt_i32 s1, 1 +; GFX11-NEXT: s_cselect_b32 s3, s3, s8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s6, s3, 7 +; GFX11-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-NEXT: s_cselect_b32 s8, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-NEXT: s_or_b32 s6, s6, s8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 
+; GFX11-NEXT: s_cmp_lt_i32 s1, 31 +; GFX11-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x40f +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_lshr_b32 s1, s7, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_b32 s1, s1, 0x8000 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 +; GFX11-NEXT: global_store_b16 v1, v0, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %mag.trunc = fptrunc double %mag to half %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign) store half %result, ptr addrspace(1) %arg_out @@ -1441,6 +1834,26 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_copysign_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; 
GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign) store <2 x half> %out, ptr addrspace(1) %arg_out ret void @@ -1527,6 +1940,31 @@ ; GFX9-NEXT: global_store_short v0, v2, s[2:3] offset:4 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_copysign_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s2, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4 +; GFX11-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign) store <3 x half> %out, ptr addrspace(1) %arg_out ret void @@ -1628,6 +2066,33 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: s_copysign_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_lshr_b32 s2, s7, 16 +; GFX11-NEXT: s_lshr_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s5, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s4, v1 +; GFX11-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s3, v2 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s2, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign) store <4 x half> %out, ptr addrspace(1) %arg_out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SIVI,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=SIVI,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) { ; SI-LABEL: s_test_copysign_f32: @@ -30,6 +31,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: 
flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float %sign) store float %result, ptr addrspace(1) %out, align 4 ret void @@ -59,6 +71,19 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float 0.0) store float %result, ptr addrspace(1) %out, align 4 ret void @@ -88,6 +113,19 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float 1.0) store float %result, ptr 
addrspace(1) %out, align 4 ret void @@ -117,6 +155,19 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_10.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float 10.0) store float %result, ptr addrspace(1) %out, align 4 ret void @@ -146,6 +197,19 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float -1.0) store float %result, ptr addrspace(1) %out, align 4 ret void @@ -175,6 +239,19 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_neg10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, 
v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float -10.0) store float %result, ptr addrspace(1) %out, align 4 ret void @@ -204,6 +281,19 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_0_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float 0.0, float %sign) store float %result, ptr addrspace(1) %out, align 4 ret void @@ -236,6 +326,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_1_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float 1.0, float %sign) store float %result, ptr addrspace(1) %out, align 4 ret void @@ -267,6 +371,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_10_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 
0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float 10.0, float %sign) store float %result, ptr addrspace(1) %out, align 4 ret void @@ -298,6 +416,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_neg1_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float -1.0, float %sign) store float %result, ptr addrspace(1) %out, align 4 ret void @@ -329,6 +461,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_neg10_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, 
v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float -10.0, float %sign) store float %result, ptr addrspace(1) %out, align 4 ret void @@ -368,6 +514,21 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) store <2 x float> %result, ptr addrspace(1) %out, align 8 ret void @@ -414,6 +575,24 @@ ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s6, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v3 +; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = 
call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign) store <3 x float> %result, ptr addrspace(1) %out, align 16 ret void @@ -465,144 +644,259 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v0 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v4 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v5 +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) store <4 x float> %result, ptr addrspace(1) %out, align 16 ret void } define float @v_test_copysign_f32(float %mag, float %sign) { -; GCN-LABEL: v_test_copysign_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f32: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: s_brev_b32 s4, -2 +; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] %result = call float @llvm.copysign.f32(float %mag, float %sign) ret float %result } define float @v_test_copysign_f32_0(float %mag) { -; GCN-LABEL: v_test_copysign_f32_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f32_0: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f32_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.copysign.f32(float %mag, float 0.0) ret float %result } define float @v_test_copysign_f32_1(float %mag) { -; GCN-LABEL: v_test_copysign_f32_1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f32_1: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.copysign.f32(float %mag, float 1.0) ret float %result } define float @v_test_copysign_f32_10(float %mag) { -; GCN-LABEL: v_test_copysign_f32_10: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f32_10: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_and_b32_e32 v0, 
0x7fffffff, v0 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f32_10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.copysign.f32(float %mag, float 10.0) ret float %result } define float @v_test_copysign_f32_neg1(float %mag) { -; GCN-LABEL: v_test_copysign_f32_neg1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f32_neg1: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f32_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.copysign.f32(float %mag, float -1.0) ret float %result } define float @v_test_copysign_f32_neg10(float %mag) { -; GCN-LABEL: v_test_copysign_f32_neg10: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f32_neg10: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f32_neg10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.copysign.f32(float %mag, float -10.0) ret float %result } define <2 x float> @v_test_copysign_v2f32(<2 x float> %mag, <2 x float> %sign) { -; 
GCN-LABEL: v_test_copysign_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_v2f32: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: s_brev_b32 s4, -2 +; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) ret <2 x float> %result } define <2 x float> @v_test_copysign_v2f32_0(<2 x float> %mag) { -; GCN-LABEL: v_test_copysign_v2f32_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_v2f32_0: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_v2f32_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> zeroinitializer) ret <2 x float> %result } define <2 x float> @v_test_copysign_v2f32_neg1(<2 x float> %mag) { -; GCN-LABEL: v_test_copysign_v2f32_neg1: -; GCN: ; %bb.0: -; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: v_or_b32_e32 v1, 0x80000000, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_v2f32_neg1: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; SIVI-NEXT: v_or_b32_e32 v1, 0x80000000, v1 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_v2f32_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> <float -1.0, float -1.0>) ret <2 x float> %result } define <3 x float> @v_test_copysign_v3f32(<3 x float> %mag, <3 x float> %sign) { -; GCN-LABEL: v_test_copysign_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v3 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4 -; GCN-NEXT: v_bfi_b32 v2, s4, v2, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_v3f32: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: s_brev_b32 s4, -2 +; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v4 +; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign) ret <3 x float> %result } define <4 x float> @v_test_copysign_v4f32(<4 x float> %mag, <4 x float> %sign) { -;
GCN-LABEL: v_test_copysign_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v4 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v5 -; GCN-NEXT: v_bfi_b32 v2, s4, v2, v6 -; GCN-NEXT: v_bfi_b32 v3, s4, v3, v7 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_v4f32: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: s_brev_b32 s4, -2 +; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v4 +; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v5 +; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v6 +; SIVI-NEXT: v_bfi_b32 v3, s4, v3, v7 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v6 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) ret <4 x float> %result } define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) { -; GCN-LABEL: v_test_copysign_v5f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_bfi_b32 v0, s4, v0, v5 -; GCN-NEXT: v_bfi_b32 v1, s4, v1, v6 -; GCN-NEXT: v_bfi_b32 v2, s4, v2, v7 -; GCN-NEXT: v_bfi_b32 v3, s4, v3, v8 -; GCN-NEXT: v_bfi_b32 v4, s4, v4, v9 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_v5f32: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: s_brev_b32 s4, -2 +; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v5 +; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v7 +; SIVI-NEXT: v_bfi_b32 v3, s4, v3, v8 +; SIVI-NEXT: v_bfi_b32 v4, s4, v4, v9 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
v_test_copysign_v5f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v5 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v7 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 +; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v4, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <5 x float> @llvm.copysign.v5f32(<5 x float> %mag, <5 x float> %sign) ret <5 x float> %result } @@ -637,6 +931,21 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sign.trunc = fptrunc double %sign to float %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc) store float %result, ptr addrspace(1) %out, align 4 @@ -669,6 +978,18 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX11-NEXT: s_endpgm %sign.trunc = fptrunc double %sign to float %result = call float @llvm.copysign.f32(float 1.0, float %sign.trunc) store float %result, ptr addrspace(1) %out, align 4 @@ -703,6 +1024,18 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_fpext_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sign.ext = fpext half %sign to float %result = call float @llvm.copysign.f32(float %mag, float %sign.ext) store float %result, ptr addrspace(1) %out, align 4 @@ -736,6 +1069,22 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_or_b32 s2, s2, 1.0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sign.ext = fpext half %sign to float %result = call float @llvm.copysign.f32(float 1.0, float %sign.ext) store float %result, ptr addrspace(1) %out, align 4 @@ -772,6 +1121,19 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: 
s_test_copysign_f32_fpext_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sign.ext = fpext bfloat %sign to float %result = call float @llvm.copysign.f32(float %mag, float %sign.ext) store float %result, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SIVI,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=SIVI,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s declare double @llvm.copysign.f64(double, double) #0 declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) #0 @@ -39,6 +40,22 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double %sign) store double %result, ptr addrspace(1) %out, align 8 ret void @@ -70,6 +87,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 s3, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double 0.0) store double %result, ptr addrspace(1) %out, align 8 ret void @@ -101,6 +132,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 s3, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double 1.0) store double %result, ptr addrspace(1) %out, align 8 ret void @@ -132,6 +177,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: 
s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset0_b32 s3, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double 10.0) store double %result, ptr addrspace(1) %out, align 8 ret void @@ -163,6 +222,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s3, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double -1.0) store double %result, ptr addrspace(1) %out, align 8 ret void @@ -194,6 +267,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_neg10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bitset1_b32 s3, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 
v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double -10.0) store double %result, ptr addrspace(1) %out, align 8 ret void @@ -231,6 +318,22 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sign.ext = fpext float %sign to double %result = call double @llvm.copysign.f64(double %mag, double %sign.ext) store double %result, ptr addrspace(1) %out, align 8 @@ -269,6 +372,22 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sign.ext = fpext half %sign to double %result = call double @llvm.copysign.f64(double %mag, double %sign.ext) store double %result, ptr addrspace(1) %out, align 8 @@ -301,6 
+420,17 @@ ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_0_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double 0.0, double %sign) store double %result, ptr addrspace(1) %out, align 4 ret void @@ -334,6 +464,18 @@ ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_1_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double 1.0, double %sign) store double %result, ptr addrspace(1) %out, align 4 ret void @@ -367,6 +509,18 @@ ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_10_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double 10.0, double %sign) store double %result, ptr addrspace(1) %out, align 4 ret void @@ -400,6 +554,18 @@ ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_neg1_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double -1.0, double %sign) store double %result, ptr addrspace(1) %out, align 4 ret void @@ -433,6 +599,18 @@ ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_f64_neg10_mag: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double -10.0, double %sign) store double %result, ptr addrspace(1) %out, align 4 ret void @@ -476,6 +654,23 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s11 +; GFX11-NEXT: v_mov_b32_e32 v2, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) store <2 x double> %result, ptr addrspace(1) %out, align 16 ret void @@ -533,6 +728,26 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_copysign_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s15 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v7, s13 :: v_dual_mov_b32 v4, s8 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s9, v5 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v7 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign) store <3 x double> %result, ptr addrspace(1) %out, align 32 ret void @@ -599,86 +814,162 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; 
GFX11-LABEL: s_test_copysign_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s15 +; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s10 +; GFX11-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v4, s4 +; GFX11-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v0, s8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, s7, v1 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v9 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s5, v5 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) store <4 x double> %result, ptr addrspace(1) %out, align 32 ret void } define double @v_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) { -; GCN-LABEL: v_test_copysign_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_mov_b32_e32 v0, v10 -; GCN-NEXT: v_bfi_b32 v1, s4, v11, v21 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f64: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: s_brev_b32 s4, -2 +; SIVI-NEXT: v_mov_b32_e32 v0, v10 +; SIVI-NEXT: v_bfi_b32 v1, s4, v11, v21 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 
v10 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v21 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.copysign.f64(double %mag, double %sign) ret double %result } define double @v_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) { -; GCN-LABEL: v_test_copysign_f64_0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v10 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f64_0: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_mov_b32_e32 v0, v10 +; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f64_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.copysign.f64(double %mag, double 0.0) ret double %result } define double @v_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) { -; GCN-LABEL: v_test_copysign_f64_1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v10 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f64_1: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_mov_b32_e32 v0, v10 +; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.copysign.f64(double %mag, double 1.0) ret double %result } define double 
@v_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) { -; GCN-LABEL: v_test_copysign_f64_10: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v10 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f64_10: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_mov_b32_e32 v0, v10 +; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f64_10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.copysign.f64(double %mag, double 10.0) ret double %result } define double @v_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x i32], double %mag) { -; GCN-LABEL: v_test_copysign_f64_neg1: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v10 -; GCN-NEXT: v_or_b32_e32 v1, 0x80000000, v11 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f64_neg1: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_mov_b32_e32 v0, v10 +; SIVI-NEXT: v_or_b32_e32 v1, 0x80000000, v11 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f64_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, v10 +; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v11 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.copysign.f64(double %mag, double -1.0) ret double %result } define double @v_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) { -; GCN-LABEL: v_test_copysign_f64_neg10: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GCN-NEXT: v_mov_b32_e32 v0, v10 -; GCN-NEXT: v_or_b32_e32 v1, 0x80000000, v11 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f64_neg10: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_mov_b32_e32 v0, v10 +; SIVI-NEXT: v_or_b32_e32 v1, 0x80000000, v11 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f64_neg10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, v10 +; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v11 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.copysign.f64(double %mag, double -10.0) ret double %result } define double @v_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], float %sign) { -; GCN-LABEL: v_test_copysign_f64_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_mov_b32_e32 v0, v10 -; GCN-NEXT: v_bfi_b32 v1, s4, v11, v20 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_f64_f32: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: s_brev_b32 s4, -2 +; SIVI-NEXT: v_mov_b32_e32 v0, v10 +; SIVI-NEXT: v_bfi_b32 v1, s4, v11, v20 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f64_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, v10 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v20 +; GFX11-NEXT: s_setpc_b64 s[30:31] %sign.ext = fpext float %sign to double %result = call double @llvm.copysign.f64(double %mag, double %sign.ext) ret double %result @@ -701,55 +992,100 @@ ; VI-NEXT: v_mov_b32_e32 v0, v10 ; VI-NEXT: v_bfi_b32 v1, s4, v11, v1 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_f64_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_lshlrev_b32 v1, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %sign.ext = fpext half %sign to double %result = call double @llvm.copysign.f64(double %mag, double %sign.ext) ret double %result } define <2 x double> @v_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) { -; GCN-LABEL: v_test_copysign_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_bfi_b32 v1, s4, v3, v7 -; GCN-NEXT: v_bfi_b32 v3, s4, v5, v9 -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_v2f64: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: s_brev_b32 s4, -2 +; SIVI-NEXT: v_mov_b32_e32 v0, v2 +; SIVI-NEXT: v_bfi_b32 v1, s4, v3, v7 +; SIVI-NEXT: v_bfi_b32 v3, s4, v5, v9 +; SIVI-NEXT: v_mov_b32_e32 v2, v4 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v3, v7 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v5, v9 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) ret <2 x double> %result } define <3 x double> @v_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) { -; GCN-LABEL: v_test_copysign_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_bfi_b32 v1, s4, v3, v9 -; GCN-NEXT: v_bfi_b32 v3, s4, v5, v11 -; GCN-NEXT: v_bfi_b32 v5, s4, v7, v13 -; 
GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: v_mov_b32_e32 v4, v6 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_v3f64: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: s_brev_b32 s4, -2 +; SIVI-NEXT: v_mov_b32_e32 v0, v2 +; SIVI-NEXT: v_bfi_b32 v1, s4, v3, v9 +; SIVI-NEXT: v_bfi_b32 v3, s4, v5, v11 +; SIVI-NEXT: v_bfi_b32 v5, s4, v7, v13 +; SIVI-NEXT: v_mov_b32_e32 v2, v4 +; SIVI-NEXT: v_mov_b32_e32 v4, v6 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v3, v9 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v5, v11 +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v7, v13 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v6 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign) ret <3 x double> %result } define <4 x double> @v_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) { -; GCN-LABEL: v_test_copysign_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_bfi_b32 v1, s4, v3, v11 -; GCN-NEXT: v_bfi_b32 v3, s4, v5, v13 -; GCN-NEXT: v_bfi_b32 v5, s4, v7, v15 -; GCN-NEXT: v_bfi_b32 v7, s4, v9, v17 -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: v_mov_b32_e32 v4, v6 -; GCN-NEXT: v_mov_b32_e32 v6, v8 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_test_copysign_v4f64: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: s_brev_b32 s4, -2 +; SIVI-NEXT: v_mov_b32_e32 v0, v2 +; SIVI-NEXT: v_bfi_b32 v1, s4, v3, v11 +; SIVI-NEXT: v_bfi_b32 v3, s4, v5, v13 +; SIVI-NEXT: v_bfi_b32 v5, s4, v7, v15 +; SIVI-NEXT: v_bfi_b32 v7, s4, v9, v17 +; SIVI-NEXT: v_mov_b32_e32 
v2, v4 +; SIVI-NEXT: v_mov_b32_e32 v4, v6 +; SIVI-NEXT: v_mov_b32_e32 v6, v8 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_test_copysign_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v3, v11 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v5, v13 +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v7, v15 +; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v9, v17 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) ret <4 x double> %result } diff --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll --- a/llvm/test/CodeGen/AMDGPU/fexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fexp.ll @@ -1,60 +1,107 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -;RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s -;RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s -;RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=GFX689,SI %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GFX689,VI %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX689,GFX9 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s define float @v_exp_f32(float %arg0) { -; GCN-LABEL: v_exp_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 
s[30:31] +; GFX689-LABEL: v_exp_f32: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GFX689-NEXT: v_exp_f32_e32 v0, v0 +; GFX689-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_exp_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.exp.f32(float %arg0) ret float %result } define <2 x float> @v_exp_v2f32(<2 x float> %arg0) { -; GCN-LABEL: v_exp_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: v_exp_f32_e32 v1, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_exp_v2f32: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GFX689-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; GFX689-NEXT: v_exp_f32_e32 v0, v0 +; GFX689-NEXT: v_exp_f32_e32 v1, v1 +; GFX689-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_exp_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: v_exp_f32_e32 v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x float> @llvm.exp.v2f32(<2 x float> %arg0) ret <2 x float> %result } define <3 x float> @v_exp_v3f32(<3 x float> %arg0) { -; GCN-LABEL: v_exp_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: 
v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: v_exp_f32_e32 v1, v1 -; GCN-NEXT: v_exp_f32_e32 v2, v2 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_exp_v3f32: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GFX689-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; GFX689-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; GFX689-NEXT: v_exp_f32_e32 v0, v0 +; GFX689-NEXT: v_exp_f32_e32 v1, v1 +; GFX689-NEXT: v_exp_f32_e32 v2, v2 +; GFX689-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_exp_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 +; GFX11-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: v_exp_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v2, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x float> @llvm.exp.v3f32(<3 x float> %arg0) ret <3 x float> %result } define <4 x float> @v_exp_v4f32(<4 x float> %arg0) { -; GCN-LABEL: v_exp_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: v_exp_f32_e32 v1, v1 -; GCN-NEXT: v_exp_f32_e32 v2, v2 -; GCN-NEXT: v_exp_f32_e32 v3, v3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_exp_v4f32: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GFX689-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; GFX689-NEXT: v_mul_f32_e32 
v2, 0x3fb8aa3b, v2 +; GFX689-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 +; GFX689-NEXT: v_exp_f32_e32 v0, v0 +; GFX689-NEXT: v_exp_f32_e32 v1, v1 +; GFX689-NEXT: v_exp_f32_e32 v2, v2 +; GFX689-NEXT: v_exp_f32_e32 v3, v3 +; GFX689-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_exp_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mul_f32 v0, 0x3fb8aa3b, v0 :: v_dual_mul_f32 v1, 0x3fb8aa3b, v1 +; GFX11-NEXT: v_dual_mul_f32 v2, 0x3fb8aa3b, v2 :: v_dual_mul_f32 v3, 0x3fb8aa3b, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: v_exp_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v2, v2 +; GFX11-NEXT: v_exp_f32_e32 v3, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.exp.v4f32(<4 x float> %arg0) ret <4 x float> %result } @@ -82,6 +129,15 @@ ; GFX9-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_exp_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.exp.f16(half %arg0) ret half %result } @@ -120,6 +176,19 @@ ; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_exp_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_mul_f16 v0, 0x3dc5, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x half> @llvm.exp.v2f16(<2 x half> %arg0) ret <2 x half> %result } @@ -182,6 +251,28 @@ ; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0 ; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_exp_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_mul_f16_e32 v1, 0x3dc5, v1 +; GFX11-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mul_f16_e32 v2, 0x3dc5, v2 +; GFX11-NEXT: v_mul_f16_e32 v3, 0x3dc5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v2, v2 +; GFX11-NEXT: v_exp_f16_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0) ret <4 x half> %result } diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -1,15 +1,130 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s -; GCN-LABEL: {{^}}test_fmax3_olt_0_f32: -; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: buffer_load_dword [[REGB:v[0-9]+]] -; GCN: buffer_load_dword [[REGA:v[0-9]+]] -; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmax3_olt_0_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_max3_f32 v0, v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmax3_olt_0_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max3_f32 v0, v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmax3_olt_0_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmax3_olt_0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -20,14 +135,127 @@ } ; Commute operand of second fmax -; GCN-LABEL: {{^}}test_fmax3_olt_1_f32: -; GCN: buffer_load_dword [[REGB:v[0-9]+]] -; GCN: buffer_load_dword [[REGA:v[0-9]+]] -; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; 
SI-LABEL: test_fmax3_olt_1_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_max3_f32 v0, v2, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmax3_olt_1_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max3_f32 v0, v2, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: 
test_fmax3_olt_1_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_max3_f32 v0, v2, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmax3_olt_1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; 
GFX11-NEXT: v_max3_f32 v0, v2, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -37,26 +265,135 @@ ret void } -; GCN-LABEL: {{^}}test_fmax3_olt_0_f16: -; GCN: buffer_load_ushort [[REGA:v[0-9]+]] -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] -; GCN: buffer_load_ushort [[REGC:v[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] -; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]] -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] - -; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] -; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] -; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] -; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] -; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[MAX0]], [[QUIET_C]] - -; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] -; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmax3_olt_0_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; 
SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max3_f32 v0, v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmax3_olt_0_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v1, v2, v2 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmax3_olt_0_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2 +; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmax3_olt_0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, 
align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -67,26 +404,135 @@ } ; Commute operand of second fmax -; GCN-LABEL: {{^}}test_fmax3_olt_1_f16: -; GCN: buffer_load_ushort [[REGA:v[0-9]+]] -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] -; GCN: buffer_load_ushort [[REGC:v[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] -; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]] -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] - -; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] -; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] -; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] -; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] -; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[MAX0]] - -; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] -; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { +; SI-LABEL: test_fmax3_olt_1_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max3_f32 v0, v2, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_fmax3_olt_1_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v1, v2, v2 +; VI-NEXT: v_max_f16_e32 v0, v1, v0 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fmax3_olt_1_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s15, s11 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s16, s4 +; GFX9-NEXT: s_mov_b32 s17, s5 +; GFX9-NEXT: s_mov_b32 s18, s10 +; GFX9-NEXT: s_mov_b32 s19, s11 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; 
GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: v_max3_f16 v0, v2, v0, v1 +; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_fmax3_olt_1_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -98,29 +544,61 @@ ; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of max3 ; since there are no pack instructions for fmax3. 
-; GCN-LABEL: {{^}}no_fmax3_v2f16: - -; SI: v_cvt_f16_f32_e32 -; SI: v_max_f32_e32 -; SI-NEXT: v_max_f32_e32 -; SI-NEXT: v_max3_f32 -; SI-NEXT: v_max3_f32 - -; VI: s_waitcnt -; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_max_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v0, v0, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_setpc_b64 - -; GFX9: s_waitcnt -; GFX9-NEXT: v_pk_max_f16 -; GFX9-NEXT: v_pk_max_f16 -; GFX9-NEXT: v_pk_max_f16 define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 { +; SI-LABEL: no_fmax3_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v1, v1, v3 +; SI-NEXT: v_max_f32_e32 v0, v0, v2 +; SI-NEXT: v_max3_f32 v0, v4, v0, v6 +; SI-NEXT: v_max3_f32 v1, v5, v1, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: no_fmax3_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; 
VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_max_f16_e32 v0, v2, v0 +; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: no_fmax3_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: v_pk_max_f16 v0, v2, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: no_fmax3_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v2, v0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max) diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -8,6 +8,9 @@ ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s ; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN %s + define half 
@test_fmax_legacy_ugt_f16(half %a, half %b) #0 { ; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_f16: ; GFX9-SAFE: ; %bb.0: @@ -54,6 +57,21 @@ ; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_f16: +; GFX11-NNAN: ; %bb.0: +; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt half %a, %b %val = select i1 %cmp, half %a, half %b ret half %val @@ -129,6 +147,27 @@ ; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v2 ; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v3 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX11-NNAN: ; %bb.0: +; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v1 +; 
GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <2 x half> %a, %b %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b ret <2 x half> %val @@ -220,6 +259,30 @@ ; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v4 ; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX11-NNAN: ; %bb.0: +; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <3 x half> %a, %b %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b ret <3 x half> %val @@ -334,6 +397,36 @@ ; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v6 ; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v7 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-NEXT: 
v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX11-NNAN: ; %bb.0: +; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b ret <4 x half> %val @@ -526,6 +619,52 @@ ; SI-NNAN-NEXT: v_max_f32_e32 v6, v6, v14 ; SI-NNAN-NEXT: v_max_f32_e32 v7, v7, v15 ; SI-NNAN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7 +; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX11-NNAN: ; %bb.0: +; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NNAN-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX11-NNAN-NEXT: v_pk_max_f16 v2, v2, v6 +; GFX11-NNAN-NEXT: v_pk_max_f16 v3, v3, v7 +; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <8 x half> %a, %b %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b ret <8 x half> %val