Patch summary (free text ahead of the first "diff --git" header):

Adds FileCheck test kernels to the AMDGPU fmuladd tests in which the multiply
and the add are separate fmul / fadd instructions instead of a call to the
fmuladd intrinsic:

  * fmuladd.f16.ll and fmuladd.v2f16.ll gain two kernels each: one with a plain
    fadd, and one whose fadd carries the `contract` fast-math flag.
  * fmuladd.f32.ll and fmuladd.f64.ll gain only the `contract` variant.

In every new kernel only the fadd is (optionally) flagged; the fmul is plain.

The first RUN line of fmuladd.f64.ll is also changed: its -check-prefixes list
goes from `GCN,GCN-STRICTSI` to `GCN,GCN-STRICT,SI`, matching the verde RUN
line directly below it.

NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed. Record boundaries follow the hunk line counts; the indentation
inside the hunks is reconstructed and should be confirmed against the original.

diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -32,6 +32,46 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}fmul_fadd_f16:
+; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; VI-DENORM-CONTRACT: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX10-FLUSH: v_mul_f16_e32
+; GFX10-FLUSH: v_add_f16_e32
+; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+define amdgpu_kernel void @fmul_fadd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
+                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
+  %r0 = load half, half addrspace(1)* %in1
+  %r1 = load half, half addrspace(1)* %in2
+  %r2 = load half, half addrspace(1)* %in3
+  %mul = fmul half %r0, %r1
+  %add = fadd half %mul, %r2
+  store half %add, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_fadd_contract_f16:
+; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX10-FLUSH: v_mul_f16_e32
+; GFX10-FLUSH: v_add_f16_e32
+; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+define amdgpu_kernel void @fmul_fadd_contract_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
+                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
+  %r0 = load half, half addrspace(1)* %in1
+  %r1 = load half, half addrspace(1)* %in2
+  %r2 = load half, half addrspace(1)* %in3
+  %mul = fmul half %r0, %r1
+  %add = fadd contract half %mul, %r2
+  store half %add, half addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -69,6 +69,24 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}fmul_fadd_contract_f32:
+; GCN-FLUSH-FMAC: v_fmac_f32_e32
+
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
+
+; GCN-DENORM-FASTFMA: v_fma_f32
+define amdgpu_kernel void @fmul_fadd_contract_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
+  %r0 = load volatile float, float addrspace(1)* %in1
+  %r1 = load volatile float, float addrspace(1)* %in2
+  %r2 = load volatile float, float addrspace(1)* %in3
+  %mul = fmul float %r0, %r1
+  %add = fadd contract float %mul, %r2
+  store float %add, float addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32
 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICTSI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
@@ -33,6 +33,20 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}fmul_fadd_contract_f64:
+; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+
+define amdgpu_kernel void @fmul_fadd_contract_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                         double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
+  %r0 = load double, double addrspace(1)* %in1
+  %r1 = load double, double addrspace(1)* %in2
+  %r2 = load double, double addrspace(1)* %in3
+  %tmp = fmul double %r0, %r1
+  %r3 = fadd contract double %tmp, %r2
+  store double %r3, double addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}fadd_a_a_b_f64:
 ; GCN: {{buffer|flat}}_load_dwordx2 [[R1:v\[[0-9]+:[0-9]+\]]],
 ; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]],
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
@@ -27,6 +27,39 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}fmul_fadd_v2f16:
+; GFX9-DENORM-STRICT: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX9-DENORM-STRICT: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX9-DENORM-CONTRACT: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+define amdgpu_kernel void @fmul_fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
+                         <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
+  %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
+  %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
+  %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
+  %r3 = fmul <2 x half> %r0, %r1
+  %r4 = fadd <2 x half> %r3, %r2
+  store <2 x half> %r4, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmul_fadd_contract_v2f16:
+; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+define amdgpu_kernel void @fmul_fadd_contract_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
+                         <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
+  %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
+  %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
+  %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
+  %r3 = fmul <2 x half> %r0, %r1
+  %r4 = fadd contract <2 x half> %r3, %r2
+  store <2 x half> %r4, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+
 ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16:
 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
 ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],