diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -1,33 +1,129 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI-FLUSH,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI-FLUSH,VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s -; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM,VI-DENORM-CONTRACT %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-STRICT %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-CONTRACT %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-STRICT %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-CONTRACT %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare half @llvm.fmuladd.f16(half, half, half) #1 declare half @llvm.fabs.f16(half) #1 -; GCN-LABEL: {{^}}fmuladd_f16: -; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -; GFX10-FLUSH: v_mul_f16_e32 -; GFX10-FLUSH: v_add_f16_e32 -; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; VI-FLUSH-LABEL: fmuladd_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 +; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4 +; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5 +; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7 +; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1] +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] +; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] +; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_f16: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 +; VI-DENORM-NEXT: v_mov_b32_e32 v2, s4 +; VI-DENORM-NEXT: v_mov_b32_e32 v3, s5 +; VI-DENORM-NEXT: v_mov_b32_e32 v4, s6 +; VI-DENORM-NEXT: v_mov_b32_e32 v5, s7 +; VI-DENORM-NEXT: flat_load_ushort v6, v[0:1] +; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] +; VI-DENORM-NEXT: flat_load_ushort v3, v[4:5] +; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_fma_f16 v2, v6, v2, v3 +; VI-DENORM-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_clause 0x2 +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_f16: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: s_clause 0x2 +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_clause 0x2 +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7] +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_f16: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: s_clause 0x2 +; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7] +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1] +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { %r0 = load half, ptr addrspace(1) %in1 %r1 = load half, ptr addrspace(1) %in2 @@ -37,16 +133,146 @@ ret void } -; GCN-LABEL: {{^}}fmul_fadd_f16: -; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -; VI-DENORM-CONTRACT: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -; GFX10-FLUSH: v_mul_f16_e32 -; GFX10-FLUSH: v_add_f16_e32 -; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; VI-FLUSH-LABEL: fmul_fadd_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 +; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4 +; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5 +; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7 +; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1] +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] +; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] +; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v2, s4 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v3, s5 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v4, s6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v5, s7 +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v6, v[0:1] +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s0 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v6, v2, v3 +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmul_fadd_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_clause 0x2 +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: s_clause 0x2 +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1) +; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2 +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmul_fadd_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_clause 0x2 +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7] +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: s_clause 0x2 +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7] +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1) +; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2 +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7] +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { %r0 = load half, ptr addrspace(1) %in1 %r1 = load half, ptr addrspace(1) %in2 @@ -57,16 +283,111 @@ ret void } -; GCN-LABEL: {{^}}fmul_fadd_contract_f16: -; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -; GFX10-FLUSH: v_mul_f16_e32 -; GFX10-FLUSH: v_add_f16_e32 -; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, +; VI-FLUSH-LABEL: fmul_fadd_contract_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 +; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4 +; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5 +; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7 +; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1] +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] +; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] +; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmul_fadd_contract_f16: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 +; VI-DENORM-NEXT: v_mov_b32_e32 v2, s4 +; VI-DENORM-NEXT: v_mov_b32_e32 v3, s5 +; VI-DENORM-NEXT: v_mov_b32_e32 v4, s6 +; VI-DENORM-NEXT: v_mov_b32_e32 v5, s7 +; VI-DENORM-NEXT: flat_load_ushort v6, v[0:1] +; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] +; VI-DENORM-NEXT: flat_load_ushort v3, v[4:5] +; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_fma_f16 v2, v6, v2, v3 +; VI-DENORM-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_clause 0x2 +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmul_fadd_contract_f16: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: s_clause 0x2 +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_clause 0x2 +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7] +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmul_fadd_contract_f16: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: s_clause 0x2 +; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7] +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1] +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { %r0 = load half, ptr addrspace(1) %in1 %r1 = load half, ptr addrspace(1) %in2 @@ -77,23 +398,101 @@ ret void } -; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16 -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], -; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] -; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] -; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]] - -; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] -; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] - -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] -; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} - define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 2.0, v4 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_fma_f16 v2, v4, 2.0, v2 +; VI-DENORM-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1 +; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1 +; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 @@ -107,23 +506,101 @@ ret void } -; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16 -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], -; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] -; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] -; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]] - -; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] -; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] - -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] -; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] - define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 2.0, v4 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_fma_f16 v2, v4, 2.0, v2 +; VI-DENORM-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1 +; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1 +; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 @@ -137,27 +614,132 @@ ret void } -; GCN-LABEL: {{^}}fadd_a_a_b_f16: -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], -; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] -; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] -; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]] - -; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] -; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] -; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] - define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, +; VI-FLUSH-LABEL: fadd_a_a_b_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 2.0, v4 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v4, 2.0, v2 +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fadd_a_a_b_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2 +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1 +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fadd_a_a_b_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2 +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1 +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -174,27 +756,132 @@ ret void } -; GCN-LABEL: {{^}}fadd_b_a_a_f16: -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], -; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] -; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] -; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]] - -; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] -; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] - define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, +; VI-FLUSH-LABEL: fadd_b_a_a_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 2.0, v4 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v4, 2.0, v2 +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fadd_b_a_a_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1 +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fadd_b_a_a_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1 +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -211,19 +898,101 @@ ret void } -; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16 -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], -; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] -; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]] -; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] -; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, -2.0, v4 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_fma_f16 v2, v4, -2.0, v2 +; VI-DENORM-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1 +; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1 +; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 @@ -237,22 +1006,101 @@ ret void } -; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16 -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], -; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] -; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] -; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] - -; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]] -; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 2.0, v4 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_fma_f16 v2, v4, 2.0, v2 +; VI-DENORM-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1 +; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1 +; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 @@ -268,22 +1116,101 @@ ret void } -; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16 -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], -; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] -; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] -; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] - -; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]] -; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, -2.0, v4 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_fma_f16 v2, v4, -2.0, v2 +; VI-DENORM-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1 +; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1 +; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 @@ -299,16 +1226,101 @@ ret void } -; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16 -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], -; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] -; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] -; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] -; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f16 v2, v4, 2.0, -v2 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_fma_f16 v2, v4, 2.0, -v2 +; VI-DENORM-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2 +; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2 +; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 @@ -324,24 +1336,158 @@ ret void } -; GCN-LABEL: {{^}}mad_sub_f16: -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] - -; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] - -; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] - -; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] -; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { +; VI-FLUSH-LABEL: mad_sub_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_mad_f16 v2, v7, v2, -v3 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: mad_sub_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, v2, -v3 +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: mad_sub_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3 +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_sub_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: mad_sub_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3 +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext @@ -359,23 +1505,158 @@ ret void } -; GCN-LABEL: {{^}}mad_sub_inv_f16: -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] -; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] - -; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] - -; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] -; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { +; VI-FLUSH-LABEL: mad_sub_inv_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_mad_f16 v2, -v7, v2, v3 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, -v7, v2, v3 +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_inv_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1 +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3 +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_sub_inv_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1 +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3 +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext @@ -393,23 +1674,158 @@ ret void } -; GCN-LABEL: {{^}}mad_sub_fabs_f16: -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] -; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| - -; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| - -; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| -; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { +; VI-FLUSH-LABEL: mad_sub_fabs_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_mad_f16 v2, v7, v2, -|v3| +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, v2, -|v3| +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_fabs_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3| +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3| +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3| +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_sub_fabs_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3| +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3| +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3| +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext @@ -428,24 +1844,158 @@ ret void } -; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16: -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] - -; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| - -; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| - -; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] -; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { +; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_mad_f16 v2, -v7, v2, |v3| +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, -v7, v2, |v3| +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1 +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3| +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1 +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3| +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext @@ -464,27 +2014,158 @@ ret void } -; GCN-LABEL: {{^}}neg_neg_mad_f16: -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] - -; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]] -; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] - -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] -; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]] - -; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] -; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[REGC]] define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { +; VI-FLUSH-LABEL: neg_neg_mad_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v7, v2 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, v2, v3 +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: neg_neg_mad_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1 +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: neg_neg_mad_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1 +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext @@ -504,24 +2185,158 @@ ret void } -; GCN-LABEL: {{^}}mad_fabs_sub_f16: -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]] - -; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] - -; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] - -; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| -; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| -; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] -; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { +; VI-FLUSH-LABEL: mad_fabs_sub_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_mad_f16 v2, v7, |v2|, -v3 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, |v2|, -v3 +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_fabs_sub_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2| +; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2| +; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3 +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_fabs_sub_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2| +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2| +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3 +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext @@ -540,26 +2355,132 @@ ret void } -; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16: -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], -; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] -; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] -; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]] - -; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] -; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, -2.0, v4 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v4, -2.0, v2 +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1 +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1 +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1 +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1 +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 @@ -575,23 +2496,132 @@ ret void } -; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16: -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]], - -; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] - -; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] - -; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] -; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f16 v2, v4, 2.0, -v2 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: +; VI-DENORM-CONTRACT: ; %bb.0: +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v4, v[0:1] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v4, 2.0, -v2 +; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 +; VI-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2 +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: +; GFX10-DENORM-STRICT: ; %bb.0: +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2 +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: s_endpgm +; +; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: +; GFX10-DENORM-CONTRACT: ; %bb.0: +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2 +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: +; GFX11-DENORM-STRICT: ; %bb.0: +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1 +; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2 +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: s_nop 0 +; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-STRICT-NEXT: s_endpgm +; +; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: +; GFX11-DENORM-CONTRACT: ; %bb.0: +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2 +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 +; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-CONTRACT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -1,41 +1,227 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI-DENORM %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-DENORM %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-DENORM %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM %s declare half @llvm.fmuladd.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) -; GCN-LABEL: {{^}}fmuladd_f16 -; GCN: buffer_load_{{ushort|u16}} v[[A_F16:[0-9]+]] -; GCN: buffer_load_{{ushort|u16}} v[[B_F16:[0-9]+]] -; GCN: buffer_load_{{ushort|u16}} v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] -; SI: buffer_store_short v[[R_F16]] - -; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] -; VI-FLUSH: buffer_store_short v[[C_F16]] - -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] -; VI-DENORM: buffer_store_short [[RESULT]] - -; GFX10PLUS-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], v[[A_F16]], v[[B_F16]] -; GFX10PLUS-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] -; GFX10PLUS-FLUSH: buffer_store_{{short|b16}} [[ADD]] - -; GFX10PLUS-DENORM: v_fmac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] -; GFX10PLUS-DENORM: buffer_store_{{short|b16}} v[[C_F16]], - -; GCN: s_endpgm define amdgpu_kernel void @fmuladd_f16( +; SI-LABEL: fmuladd_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: fmuladd_f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s10, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s10 +; VI-FLUSH-NEXT: s_mov_b32 s15, s11 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s12, s2 +; VI-FLUSH-NEXT: s_mov_b32 s13, s3 +; VI-FLUSH-NEXT: s_mov_b32 s16, s4 +; VI-FLUSH-NEXT: s_mov_b32 s17, s5 +; VI-FLUSH-NEXT: s_mov_b32 s18, s10 +; VI-FLUSH-NEXT: s_mov_b32 s19, s11 +; VI-FLUSH-NEXT: s_mov_b32 s4, s6 +; VI-FLUSH-NEXT: s_mov_b32 s5, s7 +; VI-FLUSH-NEXT: s_mov_b32 s6, s10 +; VI-FLUSH-NEXT: s_mov_b32 s7, s11 +; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_mov_b32 s8, s0 +; VI-FLUSH-NEXT: s_mov_b32 s9, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, v0, v1 +; VI-FLUSH-NEXT: buffer_store_short v2, off, s[8:11], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_f16: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s10, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s10 +; VI-DENORM-NEXT: s_mov_b32 s15, s11 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: s_mov_b32 s12, s2 +; VI-DENORM-NEXT: s_mov_b32 s13, s3 +; VI-DENORM-NEXT: s_mov_b32 s16, s4 +; VI-DENORM-NEXT: s_mov_b32 s17, s5 +; VI-DENORM-NEXT: s_mov_b32 s18, s10 +; VI-DENORM-NEXT: s_mov_b32 s19, s11 +; VI-DENORM-NEXT: s_mov_b32 s4, s6 +; VI-DENORM-NEXT: s_mov_b32 s5, s7 +; VI-DENORM-NEXT: s_mov_b32 s6, s10 +; VI-DENORM-NEXT: s_mov_b32 s7, s11 +; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-DENORM-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-DENORM-NEXT: s_mov_b32 s8, s0 +; VI-DENORM-NEXT: s_mov_b32 s9, s1 +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_fma_f16 v0, v0, v1, v2 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 +; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4 +; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5 +; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6 +; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7 +; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11 +; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0 +; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1 +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_f16: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 +; GFX10-DENORM-NEXT: s_mov_b32 s18, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s19, s11 +; GFX10-DENORM-NEXT: s_mov_b32 s22, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s23, s11 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: s_mov_b32 s12, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s13, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s16, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s17, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s20, s6 +; GFX10-DENORM-NEXT: s_mov_b32 s21, s7 +; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; GFX10-DENORM-NEXT: buffer_load_ushort v2, off, s[20:23], 0 +; GFX10-DENORM-NEXT: s_mov_b32 s8, s0 +; GFX10-DENORM-NEXT: s_mov_b32 s9, s1 +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[8:11], 0 +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11 +; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4 +; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5 +; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11 +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0 +; GFX11-FLUSH-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_f16: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 +; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s15, s11 +; GFX11-DENORM-NEXT: s_mov_b32 s18, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s19, s11 +; GFX11-DENORM-NEXT: s_mov_b32 s22, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s23, s11 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: s_mov_b32 s12, s2 +; GFX11-DENORM-NEXT: s_mov_b32 s13, s3 +; GFX11-DENORM-NEXT: s_mov_b32 s16, s4 +; GFX11-DENORM-NEXT: s_mov_b32 s17, s5 +; GFX11-DENORM-NEXT: s_mov_b32 s20, s6 +; GFX11-DENORM-NEXT: s_mov_b32 s21, s7 +; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-DENORM-NEXT: buffer_load_u16 v2, off, s[20:23], 0 +; GFX11-DENORM-NEXT: s_mov_b32 s8, s0 +; GFX11-DENORM-NEXT: s_mov_b32 s9, s1 +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[8:11], 0 +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -48,31 +234,182 @@ ret void } -; GCN-LABEL: {{^}}fmuladd_f16_imm_a -; GCN: buffer_load_{{ushort|u16}} v[[B_F16:[0-9]+]] -; GCN: buffer_load_{{ushort|u16}} v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] -; SI: buffer_store_short v[[R_F16]] - -; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]] -; VI-FLUSH: buffer_store_short v[[C_F16]] - -; VI-DENORM: s_movk_i32 [[KA:s[0-9]+]], 0x4200 -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]] -; VI-DENORM: buffer_store_short [[RESULT]] - -; GFX10PLUS-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[B_F16]] -; GFX10PLUS-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] -; GFX10PLUS-FLUSH: buffer_store_{{short|b16}} [[ADD]] - -; GFX10PLUS-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]] -; GFX10PLUS-DENORM: buffer_store_{{short|b16}} v[[C_F16]], - -; GCN: s_endpgm define amdgpu_kernel void @fmuladd_f16_imm_a( +; SI-LABEL: fmuladd_f16_imm_a: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mac_f32_e32 v1, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: fmuladd_f16_imm_a: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s2 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s12, s6 +; VI-FLUSH-NEXT: s_mov_b32 s13, s7 +; VI-FLUSH-NEXT: s_mov_b32 s15, s3 +; VI-FLUSH-NEXT: s_mov_b32 s10, s2 +; VI-FLUSH-NEXT: s_mov_b32 s11, s3 +; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s0, s4 +; VI-FLUSH-NEXT: s_mov_b32 s1, s5 +; VI-FLUSH-NEXT: v_mac_f16_e32 v1, 0x4200, v0 +; VI-FLUSH-NEXT: buffer_store_short v1, off, s[0:3], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_f16_imm_a: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s2, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s2 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: s_mov_b32 s12, s6 +; VI-DENORM-NEXT: s_mov_b32 s13, s7 +; VI-DENORM-NEXT: s_mov_b32 s15, s3 +; VI-DENORM-NEXT: s_mov_b32 s10, s2 +; VI-DENORM-NEXT: s_mov_b32 s11, s3 +; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: s_mov_b32 s0, s4 +; VI-DENORM-NEXT: s_movk_i32 s4, 0x4200 +; VI-DENORM-NEXT: s_mov_b32 s1, s5 +; VI-DENORM-NEXT: v_fma_f16 v0, v0, s4, v1 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_f16_imm_a: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_clause 0x1 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 +; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 +; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 +; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_f16_imm_a: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_clause 0x1 +; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s10, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s11, s3 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 +; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 +; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 +; GFX10-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 +; GFX10-DENORM-NEXT: buffer_store_short v1, off, s[0:3], 0 +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_f16_imm_a: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11 +; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7 +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4 +; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5 +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_f16_imm_a: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_clause 0x1 +; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 +; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s15, s11 +; GFX11-DENORM-NEXT: s_mov_b32 s2, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s3, s11 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: s_mov_b32 s12, s6 +; GFX11-DENORM-NEXT: s_mov_b32 s13, s7 +; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: s_mov_b32 s8, s4 +; GFX11-DENORM-NEXT: s_mov_b32 s9, s5 +; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 +; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0 +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, ptr addrspace(1) %c) { @@ -83,31 +420,182 @@ ret void } -; GCN-LABEL: {{^}}fmuladd_f16_imm_b -; GCN: buffer_load_{{ushort|u16}} v[[A_F16:[0-9]+]] -; GCN: buffer_load_{{ushort|u16}} v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[A_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] -; SI: buffer_store_short v[[R_F16]] - -; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]] -; VI-FLUSH: buffer_store_short v[[C_F16]] - -; VI-DENORM: s_movk_i32 [[KA:s[0-9]+]], 0x4200 -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]] -; VI-DENORM: buffer_store_short [[RESULT]] - -; GFX10PLUS-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[A_F16]] -; GFX10PLUS-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] -; GFX10PLUS-FLUSH: buffer_store_{{short|b16}} [[ADD]] - -; GFX10PLUS-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]] -; GFX10PLUS-DENORM: buffer_store_{{short|b16}} v[[C_F16]], - -; GCN: s_endpgm define amdgpu_kernel void @fmuladd_f16_imm_b( +; SI-LABEL: fmuladd_f16_imm_b: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mac_f32_e32 v1, 0x40400000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: fmuladd_f16_imm_b: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s2 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s12, s6 +; VI-FLUSH-NEXT: s_mov_b32 s13, s7 +; VI-FLUSH-NEXT: s_mov_b32 s15, s3 +; VI-FLUSH-NEXT: s_mov_b32 s10, s2 +; VI-FLUSH-NEXT: s_mov_b32 s11, s3 +; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s0, s4 +; VI-FLUSH-NEXT: s_mov_b32 s1, s5 +; VI-FLUSH-NEXT: v_mac_f16_e32 v1, 0x4200, v0 +; VI-FLUSH-NEXT: buffer_store_short v1, off, s[0:3], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_f16_imm_b: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s2, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s2 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: s_mov_b32 s12, s6 +; VI-DENORM-NEXT: s_mov_b32 s13, s7 +; VI-DENORM-NEXT: s_mov_b32 s15, s3 +; VI-DENORM-NEXT: s_mov_b32 s10, s2 +; VI-DENORM-NEXT: s_mov_b32 s11, s3 +; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: s_mov_b32 s0, s4 +; VI-DENORM-NEXT: s_movk_i32 s4, 0x4200 +; VI-DENORM-NEXT: s_mov_b32 s1, s5 +; VI-DENORM-NEXT: v_fma_f16 v0, v0, s4, v1 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_f16_imm_b: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_clause 0x1 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 +; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 +; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 +; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 +; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_f16_imm_b: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_clause 0x1 +; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s10, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s11, s3 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 +; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 +; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 +; GFX10-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 +; GFX10-DENORM-NEXT: buffer_store_short v1, off, s[0:3], 0 +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_f16_imm_b: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11 +; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7 +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4 +; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5 +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_f16_imm_b: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_clause 0x1 +; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 +; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s15, s11 +; GFX11-DENORM-NEXT: s_mov_b32 s2, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s3, s11 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: s_mov_b32 s12, s6 +; GFX11-DENORM-NEXT: s_mov_b32 s13, s7 +; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: s_mov_b32 s8, s4 +; GFX11-DENORM-NEXT: s_mov_b32 s9, s5 +; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 +; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0 +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %c) { @@ -118,64 +606,240 @@ ret void } -; GCN-LABEL: {{^}}fmuladd_v2f16 -; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] - -; VI-FLUSH: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VI-FLUSH: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; VI-FLUSH: buffer_load_dword v[[B_V2_F16:[0-9]+]] - -; VI-DENORM: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]] - -; GFX10PLUS: buffer_load_{{dword|b32}} v[[A_V2_F16:[0-9]+]] -; GFX10PLUS: buffer_load_{{dword|b32}} v[[B_V2_F16:[0-9]+]] -; GFX10PLUS: buffer_load_{{dword|b32}} v[[C_V2_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] - -; VI-FLUSH: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]] -; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] -; VI-FLUSH-NOT: v_and_b32 -; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]] - -; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]] -; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]] -; VI-DENORM-NOT: v_and_b32 -; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]] - -; GFX10PLUS-FLUSH: v_pk_mul_f16 [[MUL:v[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] -; GFX10PLUS-FLUSH: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[MUL]], v[[C_V2_F16]] - -; GFX10PLUS-DENORM: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] - -; GCN: buffer_store_{{dword|b32}} v[[R_V2_F16]] define amdgpu_kernel void @fmuladd_v2f16( +; SI-LABEL: fmuladd_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mac_f32_e32 v5, v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_mac_f32_e32 v2, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: fmuladd_v2f16: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s10, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s10 +; VI-FLUSH-NEXT: s_mov_b32 s15, s11 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s12, s2 +; VI-FLUSH-NEXT: s_mov_b32 s13, s3 +; VI-FLUSH-NEXT: s_mov_b32 s16, s4 +; VI-FLUSH-NEXT: s_mov_b32 s17, s5 +; VI-FLUSH-NEXT: s_mov_b32 s18, s10 +; VI-FLUSH-NEXT: s_mov_b32 s19, s11 +; VI-FLUSH-NEXT: s_mov_b32 s4, s6 +; VI-FLUSH-NEXT: s_mov_b32 s5, s7 +; VI-FLUSH-NEXT: s_mov_b32 s6, s10 +; VI-FLUSH-NEXT: s_mov_b32 s7, s11 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0 +; VI-FLUSH-NEXT: s_mov_b32 s8, s0 +; VI-FLUSH-NEXT: s_mov_b32 s9, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(1) +; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-FLUSH-NEXT: v_mac_f16_e32 v1, v0, v2 +; VI-FLUSH-NEXT: v_or_b32_e32 v0, v1, v3 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: fmuladd_v2f16: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s10, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s10 +; VI-DENORM-NEXT: s_mov_b32 s15, s11 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: s_mov_b32 s16, s4 +; VI-DENORM-NEXT: s_mov_b32 s17, s5 +; VI-DENORM-NEXT: s_mov_b32 s4, s6 +; VI-DENORM-NEXT: s_mov_b32 s5, s7 +; VI-DENORM-NEXT: s_mov_b32 s6, s10 +; VI-DENORM-NEXT: s_mov_b32 s7, s11 +; VI-DENORM-NEXT: s_mov_b32 s12, s2 +; VI-DENORM-NEXT: s_mov_b32 s13, s3 +; VI-DENORM-NEXT: s_mov_b32 s18, s10 +; VI-DENORM-NEXT: s_mov_b32 s19, s11 +; VI-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-DENORM-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; VI-DENORM-NEXT: s_mov_b32 s8, s0 +; VI-DENORM-NEXT: s_mov_b32 s9, s1 +; VI-DENORM-NEXT: s_waitcnt vmcnt(2) +; VI-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-DENORM-NEXT: s_waitcnt vmcnt(1) +; VI-DENORM-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-DENORM-NEXT: v_fma_f16 v3, v5, v4, v3 +; VI-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-DENORM-NEXT: v_fma_f16 v0, v2, v1, v0 +; VI-DENORM-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmuladd_v2f16: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 +; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4 +; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5 +; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GFX10-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6 +; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7 +; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11 +; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0 +; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1 +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) +; GFX10-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmuladd_v2f16: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 +; GFX10-DENORM-NEXT: s_mov_b32 s18, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s19, s11 +; GFX10-DENORM-NEXT: s_mov_b32 s22, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s23, s11 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: s_mov_b32 s12, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s13, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s16, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s17, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s20, s6 +; GFX10-DENORM-NEXT: s_mov_b32 s21, s7 +; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GFX10-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; GFX10-DENORM-NEXT: buffer_load_dword v2, off, s[20:23], 0 +; GFX10-DENORM-NEXT: s_mov_b32 s8, s0 +; GFX10-DENORM-NEXT: s_mov_b32 s9, s1 +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2 +; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-DENORM-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fmuladd_v2f16: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11 +; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4 +; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5 +; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11 +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0 +; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1) +; GFX11-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_nop 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-DENORM-LABEL: fmuladd_v2f16: +; GFX11-DENORM: ; %bb.0: +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 +; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s15, s11 +; GFX11-DENORM-NEXT: s_mov_b32 s18, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s19, s11 +; GFX11-DENORM-NEXT: s_mov_b32 s22, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s23, s11 +; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-NEXT: s_mov_b32 s12, s2 +; GFX11-DENORM-NEXT: s_mov_b32 s13, s3 +; GFX11-DENORM-NEXT: s_mov_b32 s16, s4 +; GFX11-DENORM-NEXT: s_mov_b32 s17, s5 +; GFX11-DENORM-NEXT: s_mov_b32 s20, s6 +; GFX11-DENORM-NEXT: s_mov_b32 s21, s7 +; GFX11-DENORM-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-DENORM-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-DENORM-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-DENORM-NEXT: s_mov_b32 s8, s0 +; GFX11-DENORM-NEXT: s_mov_b32 s9, s1 +; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2 +; GFX11-DENORM-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-DENORM-NEXT: s_nop 0 +; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DENORM-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b,