diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3297,7 +3297,7 @@ } } - unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 + unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 : IsF64 ? AMDGPU::V_FMA_F64_e64 : AMDGPU::V_FMA_F32_e64) : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll @@ -107,9 +107,9 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v4, v2, v3 -; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v4, v0, v1 -; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-CONTRACT-NEXT: v_fma_f16 v2, v2, v3, v4 +; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_half_add_mul: @@ -148,9 +148,9 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v4, v2, v3 -; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v4, v0, v1 -; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-CONTRACT-NEXT: v_fma_f16 v2, v2, v3, v4 +; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_half_add_mul_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -166,8 +166,7 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1 -; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_half_add_mul: @@ -182,8 +181,7 @@ ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-UNSAFE-NEXT: v_fmac_f16_e32 v2, v0, v1 -; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul half %x, %y @@ -229,8 +227,7 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1 -; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_half_add_mul_rhs: @@ -245,8 +242,7 @@ ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-UNSAFE-NEXT: v_fmac_f16_e32 v2, v0, v1 -; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul half %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -93,8 +93,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fmac_f16_e32 v2, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %fma = call half @llvm.fma.f16(half %x, half %y, half %z) ret half %fma diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -14,8 +14,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fmac_f16_e32 v2, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val @@ -75,9 +74,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val