# Patch summary (commentary only; everything from the first "diff --git" header
# onward is the original patch text, unchanged).
#
# Purpose: make v_fmamk_f32 / v_fmaak_f32 available on GFX940.
#
# Changes, in the order they appear below:
#   * AMDGPU.td
#       HasFmaakFmamkF32Insts: the AssemblerPredicate becomes
#       (any_of FeatureGFX10Insts, FeatureGFX940Insts).
#   * GCNSubtarget.h
#       hasFmaakFmamkF32Insts() now also returns true when hasGFX940Insts().
#   * SIInstrInfo.cpp
#       Adds the enumerator GFX940 = 9 after GFX90A = 8.
#       Inside the `if (ST.hasGFX90AInsts())` branch the MC opcode is looked up
#       in the GFX940 family first (only when ST.hasGFX940Insts()); while the
#       result is still (uint16_t)-1 the lookup falls back to GFX90A, then GFX9.
#   * SIInstrInfo.td
#       Adds `int GFX940 = 9;` (same value as the C++ enumerator above) and
#       appends a GFX940 entry to the list that already holds the
#       GFX9 / GFX10 / SDWA10 / GFX90A entries.
#   * VOP2Instructions.td
#       New multiclass VOP2_Real_MADK_gfx940 defining a `_gfx940` record with
#       DecoderNamespace = "GFX9"; instantiated for V_FMAMK_F32 (0x17) and
#       V_FMAAK_F32 (0x18) under SubtargetPredicate = HasFmaakFmamkF32Insts.
#   * test/CodeGen/AMDGPU/madak.ll
#       The gfx1010 -fp-contract=fast RUN line gains the GFX10-FMA prefix and a
#       gfx940 -fp-contract=fast RUN line is added with the GFX940-FMA prefix.
#       Former FMA: checks are split into GFX10-FMA: (unchanged expectations)
#       and GFX940-FMA: (expects v_fmac_f32 / v_fma_f32 with a register operand
#       where GFX10 expects the 0x... literal forms).
#   * test/MC/AMDGPU/gfx940_asm_features.s (new)
#       gfx940 must assemble both instructions to the listed encodings; gfx90a
#       must report "error: instruction not supported on this GPU".
#   * test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt (new)
#       The same two byte sequences must disassemble back to the same text.
#
# NOTE(review): this copy of the patch appears to have had its line breaks
# collapsed into spaces, so it will not apply as-is. Several TableGen spans also
# look like they lost angle-bracket template arguments during extraction
# (`!cast(SIEncodingFamily.GFX9)`, `multiclass VOP2_Real_MADK_gfx940 op>`,
# `VOP2_Real(NAME), ...`, `VOP2_MADKe(NAME).Pfl>`). Presumably these were
# `!cast<...>(...)` / `<bits<N> op>` forms - confirm against the upstream
# revision before using this text.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1459,7 +1459,7 @@ def HasFmaakFmamkF32Insts : Predicate<"Subtarget->hasFmaakFmamkF32Insts()">, - AssemblerPredicate<(any_of FeatureGFX10Insts)>; + AssemblerPredicate<(any_of FeatureGFX10Insts, FeatureGFX940Insts)>; def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">, AssemblerPredicate<(all_of FeatureImageInsts)>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -843,7 +843,7 @@ } bool hasFmaakFmamkF32Insts() const { - return getGeneration() >= GFX10; + return getGeneration() >= GFX10 || hasGFX940Insts(); } bool hasImageInsts() const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7769,7 +7769,8 @@ GFX9 = 5, GFX10 = 6, SDWA10 = 7, - GFX90A = 8 + GFX90A = 8, + GFX940 = 9 }; static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { @@ -7849,6 +7850,9 @@ if (ST.hasGFX90AInsts()) { uint16_t NMCOp = (uint16_t)-1; + if (ST.hasGFX940Insts()) + NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940); + if (NMCOp == (uint16_t)-1) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); if (NMCOp == (uint16_t)-1) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -29,6 +29,7 @@ int GFX10 = 6; int SDWA10 = 7; int GFX90A = 8; + int GFX940 = 9; } //===----------------------------------------------------------------------===// @@ -2543,7 +2544,8 @@ [!cast(SIEncodingFamily.GFX9)], [!cast(SIEncodingFamily.GFX10)], 
[!cast(SIEncodingFamily.SDWA10)], - [!cast(SIEncodingFamily.GFX90A)]]; + [!cast(SIEncodingFamily.GFX90A)], + [!cast(SIEncodingFamily.GFX940)]]; } // Get equivalent SOPK instruction. diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1436,6 +1436,13 @@ VOP2_MADKe(NAME).Pfl>; } +multiclass VOP2_Real_MADK_gfx940 op> { + def _gfx940 : VOP2_Real(NAME), SIEncodingFamily.GFX940>, + VOP2_MADKe(NAME).Pfl> { + let DecoderNamespace = "GFX9"; + } +} + multiclass VOP2_Real_e32_vi op> { def _e32_vi : VOP2_Real(NAME#"_e32"), SIEncodingFamily.VI>, @@ -1736,6 +1743,11 @@ } } // End SubtargetPredicate = isGFX90APlus +let SubtargetPredicate = HasFmaakFmamkF32Insts in { +defm V_FMAMK_F32 : VOP2_Real_MADK_gfx940 <0x17>; +defm V_FMAAK_F32 : VOP2_Real_MADK_gfx940 <0x18>; +} + multiclass VOP2_Real_DOT_ACC_gfx9 op> : VOP2_Real_e32_vi { def _dpp_vi : VOP2_DPP(NAME#"_dpp")>; } diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -2,7 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA,GFX10-FMA %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck 
-check-prefixes=GCN,GFX9,GFX8_9_10,FMA,GFX940-FMA %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.fabs.f32(float) nounwind readnone @@ -135,7 +136,8 @@ ; GCN-NOT: v_madak_f32 ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] ; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000 -; FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000 +; GFX10-FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000 +; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]] define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -156,7 +158,8 @@ ; GFX6_8_9-NOT: v_madak_f32 ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] ; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 -; FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 +; GFX10-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 +; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]] define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -174,7 +177,8 @@ ; GCN-NOT: v_madak_f32 ; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -; FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; GFX10-FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; GFX940-FMA: v_fmac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) #0 { %mul = fmul float %a, %b %madak = fadd float %mul, 10.0 @@ -189,7 +193,8 @@ ; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, 
|{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 -; FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 +; GFX10-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 +; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{s[0-9]+}} ; GCN: s_endpgm define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -215,7 +220,8 @@ ; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000 -; FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000 +; GFX10-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000 +; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}} ; GCN: s_endpgm define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -245,7 +251,8 @@ ; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5 ; GFX10: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] ; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 -; FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; GFX10-FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; GFX940-FMA: v_fmac_f32_e64 [[MADAK:v[0-9]+]], [[SGPR0]], 0.5 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] ; GFX6: buffer_store_dword [[MUL]] ; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]] diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s new file mode 100644 --- /dev/null +++ 
b/llvm/test/MC/AMDGPU/gfx940_asm_features.s @@ -0,0 +1,10 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=GFX90A --implicit-check-not=error: %s + +// GFX90A: error: instruction not supported on this GPU +// GFX940: v_fmamk_f32 v0, v2, 0x42c80000, v3 ; encoding: [0x02,0x07,0x00,0x2e,0x00,0x00,0xc8,0x42] +v_fmamk_f32 v0, v2, 100.0, v3 + +// GFX90A: error: instruction not supported on this GPU +// GFX940: v_fmaak_f32 v0, v2, v3, 0x42c80000 ; encoding: [0x02,0x07,0x00,0x30,0x00,0x00,0xc8,0x42] +v_fmaak_f32 v0, v2, v3, 100.0 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt @@ -0,0 +1,7 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx940 -disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s + +# GFX940: v_fmamk_f32 v0, v2, 0x42c80000, v3 ; encoding: [0x02,0x07,0x00,0x2e,0x00,0x00,0xc8,0x42] +0x02,0x07,0x00,0x2e,0x00,0x00,0xc8,0x42 + +# GFX940: v_fmaak_f32 v0, v2, v3, 0x42c80000 ; encoding: [0x02,0x07,0x00,0x30,0x00,0x00,0xc8,0x42] +0x02,0x07,0x00,0x30,0x00,0x00,0xc8,0x42