Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -848,20 +848,29 @@ // VOP2 Patterns //===----------------------------------------------------------------------===// -multiclass FMADPat <ValueType vt, Instruction inst> { - def : GCNPat < - (vt (fmad (VOP3NoMods vt:$src0), - (VOP3NoMods vt:$src1), - (VOP3NoMods vt:$src2))), +// TODO: Check only no src2 mods? +class FMADPat <ValueType vt, SDPatternOperator node, Instruction inst> + : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)), + (vt (VOP3NoMods vt:$src1)), + (vt (VOP3NoMods vt:$src2)))), (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) - >; +>; + + +// Prefer mac form when there are no modifiers. +let AddedComplexity = 9 in { +def : FMADPat <f32, fmad, V_MAC_F32_e64>; +def : FMADPat <f32, AMDGPUfmad_ftz, V_MAC_F32_e64>; + +let SubtargetPredicate = Has16BitInsts in { +def : FMADPat <f16, fmad, V_MAC_F16_e64>; +def : FMADPat <f16, AMDGPUfmad_ftz, V_MAC_F16_e64>; } -defm : FMADPat <f16, V_MAC_F16_e64>; -defm : FMADPat <f32, V_MAC_F32_e64>; +} -class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty> +class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr> : GCNPat< (Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)), (Ty (VOP3Mods Ty:$src1, i32:$src1_mod)), @@ -870,9 +879,8 @@ $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) >; -// FIXME: This should select to V_MAC_F32 -def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>; -def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> { +def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>; +def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> { let SubtargetPredicate = Has16BitInsts; } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmad.ftz.mir @@ -19,8 +19,8 @@ ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]] + ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = 
V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -43,8 +43,8 @@ ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]] + ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(s32) = COPY $vgpr1 @@ -67,8 +67,8 @@ ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]] + ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s32) = COPY $vgpr1 @@ -91,8 +91,9 @@ ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]] + ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY3]], 0, 0, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = COPY $sgpr0 @@ -116,8 +117,8 @@ ; GCN: 
liveins: $sgpr0, $vgpr0 ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]] + ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %1 @@ -138,8 +139,9 @@ ; GCN: liveins: $sgpr0, $vgpr0 ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]] + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %1, %0 @@ -160,8 +162,9 @@ ; GCN: liveins: $sgpr0, $vgpr0 ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY1]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]] + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %1, %0, %0 @@ -181,8 +184,9 @@ ; GCN-LABEL: name: fmad_ftz_s32_vsss ; GCN: liveins: $sgpr0, $vgpr0 ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY 
$sgpr0 - ; GCN: [[V_MAD_F32_:%[0-9]+]]:vgpr_32 = V_MAD_F32 0, [[COPY]], 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_MAD_F32_]] + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GCN: [[V_MAC_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAC_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MAC_F32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmad.ftz), %0, %0, %0 S_ENDPGM 0, implicit %1 Index: llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -137,8 +137,8 @@ } ; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32: -; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000 -; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]] +; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000 +; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}} ; GCN-NOT: v_mul ; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.f16.ll @@ -5,8 +5,7 @@ declare half @llvm.amdgcn.fmad.ftz.f16(half %a, half %b, half %c) ; GCN-LABEL: {{^}}mad_f16: -; GFX8: v_ma{{[dc]}}_f16 -; GFX9: v_mad_legacy_f16 +; GCN: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}} define amdgpu_kernel void @mad_f16( half addrspace(1)* %r, half addrspace(1)* %a, @@ -34,9 +33,7 @@ } ; GCN-LABEL: {{^}}mad_f16_imm_b: -; GCN: s_movk_i32 [[KB:s[0-9]+]], 0x4800 -; GFX8: v_mad_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]], -; GFX9: v_mad_legacy_f16 {{v[0-9]+}}, {{v[0-9]+}}, [[KB]], +; GCN: v_mac_f16_e32 {{v[0-9]+}}, 0x4800, {{v[0-9]+$}} define amdgpu_kernel void @mad_f16_imm_b( half 
addrspace(1)* %r, half addrspace(1)* %a, Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll @@ -35,7 +35,7 @@ ; GCN-LABEL: {{^}}mad_f32_imm_b: ; GCN: v_mov_b32_e32 [[KB:v[0-9]+]], 0x41000000 -; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, [[KB]], +; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{[s][0-9]+}}, [[KB]] define amdgpu_kernel void @mad_f32_imm_b( float addrspace(1)* %r, float addrspace(1)* %a, @@ -48,8 +48,11 @@ } ; GCN-LABEL: {{^}}mad_f32_imm_c: -; GCN: v_mov_b32_e32 [[KC:v[0-9]+]], 0x41000000 -; GCN: v_ma{{[dc]}}_f32 {{v[0-9]+}}, {{[vs][0-9]+}}, {{v[0-9]+}}, [[KC]]{{$}} +; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x41000000 +; GCN: s_load_dword [[A:s[0-9]+]] +; GCN: s_load_dword [[B:s[0-9]+]] +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] +; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}} define amdgpu_kernel void @mad_f32_imm_c( float addrspace(1)* %r, float addrspace(1)* %a,