diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -95,7 +95,9 @@ return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 || Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64; + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64 || + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64 || + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64; } static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { @@ -1477,6 +1479,8 @@ switch (Opc1) { case AMDGPU::V_MFMA_F64_16X16X4F64_e64: case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: if (!isXDL(ST, *MI)) NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates; break; @@ -1509,6 +1513,8 @@ switch (Opc1) { case AMDGPU::V_MFMA_F64_16X16X4F64_e64: case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; break; case AMDGPU::V_MFMA_F64_4X4X4F64_e64: diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -300,6 +300,13 @@ assert(!Fold.needsShrink() && "not handled"); if (Fold.isImm()) { + if (Old.isTied()) { + int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode()); + if (NewMFMAOpc == -1) + return false; + MI->setDesc(TII.get(NewMFMAOpc)); + MI->untieRegOperand(0); + } Old.ChangeToImmediate(Fold.ImmToFold); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1246,6 +1246,10 @@ LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode); + /// \returns earlyclobber version of a MAC MFMA if it exists. + LLVM_READONLY + int getMFMAEarlyClobberOp(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3198,10 +3198,14 @@ Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; + int NewMFMAOpc = -1; switch (Opc) { default: - return nullptr; + NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); + if (NewMFMAOpc == -1) + return nullptr; + break; case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F16_e64: IsF16 = true; @@ -3230,6 +3234,19 @@ } } + MachineInstrBuilder MIB; + MachineBasicBlock &MBB = *MI.getParent(); + + if (NewMFMAOpc != -1) { + MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + updateLiveVariables(LV, MI, *MIB); + if (LIS) + LIS->ReplaceMachineInstrInMaps(MI, *MIB); + return MIB; + } + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); const MachineOperand *Src0Mods = @@ -3240,8 +3257,6 @@ const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - MachineInstrBuilder MIB; - MachineBasicBlock &MBB = *MI.getParent(); if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 && // If 
we have an SGPR input, we will violate the constant bus restriction. @@ -7750,6 +7765,12 @@ } } + if (isMAI(Opcode)) { + int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode); + if (MFMAOp != -1) + Opcode = MFMAOp; + } + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); // -1 means that Opcode is already a native instruction. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2588,6 +2588,14 @@ let ValueCols = [["SV"]]; } +def getMFMAEarlyClobberOp : InstrMapping { + let FilterClass = "MFMATable"; + let RowFields = ["FMAOp"]; + let ColFields = ["IsMac"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -388,6 +388,12 @@ let HasModifiers = 0; let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp"; let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp); + // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs. + // We then create two versions of the instruction: with tied dst and src2 + // and with the earlyclobber flag on the dst. This is stricter than the + // actual HW restriction. In particular earlyclobber also affects src0 and + // src1 allocation which is not required. 
+ bit NoDstOverlap = !gt(DstVT.Size, 128); } def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI; @@ -426,6 +432,11 @@ def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI; def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI; +class MFMATable { + bit IsMac = is_mac; + string FMAOp = Name; +} + let Predicates = [HasMAIInsts] in { let isAsCheapAsAMove = 1, isReMaterializable = 1 in { @@ -435,13 +446,31 @@ } // End isMoveImm = 1 } // End isAsCheapAsAMove = 1, isReMaterializable = 1 -multiclass MAIInst { +multiclass MAIInst("VOPProfileMAI_" # P).NoDstOverlap> { let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. - defm "" : VOP3Inst("VOPProfileMAI_" # P), node>; - - let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in - defm _vgprcd : VOP3Inst("VOPProfileMAI_" # P # "_VCD")>; + let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { + defm "" : VOP3Inst("VOPProfileMAI_" # P), !if(NoDstOverlap, null_frag, node)>, + MFMATable<0, NAME # "_e64">; + + let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in + defm _vgprcd : VOP3Inst("VOPProfileMAI_" # P # "_VCD")>, + MFMATable<0, NAME # "_vgprcd_e64">; + } + + foreach _ = BoolToList.ret in { + let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), + isConvertibleToThreeAddress = NoDstOverlap, + Mnemonic = OpName in { + defm "_mac" : VOP3Inst("VOPProfileMAI_" # P), node>, + MFMATable<1, NAME # "_e64">; + + let SubtargetPredicate = isGFX90APlus in + defm _mac_vgprcd : VOP3Inst("VOPProfileMAI_" # P # "_VCD")>, + MFMATable<1, NAME # "_vgprcd_e64">; + } + } } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 } @@ -517,6 +546,7 @@ } } +let Constraints = "" in { multiclass VOP3P_Real_MFMA_gfx90a op> { let SubtargetPredicate = isGFX90AOnly, AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in { @@ -536,6 +566,7 @@ let DecoderNamespace = "GFX8"; } } +} defm V_PK_MAD_I16 : VOP3P_Real_vi 
<0x00>; defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -601,14 +601,15 @@ ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg: ; GFX90A-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GFX90A-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 ; GCN-COUNT-8: global_load_dwordx4 ; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX908-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 ; GFX90A-NOT: v_accvgpr_write +; GFX908-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GFX908-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 ; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 ; GFX908-COUNT-8: global_store_dwordx4 ; GFX90A-NOT: v_accvgpr_read_b32 ; GFX90A-COUNT-5: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}] diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -0,0 +1,66 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope 
--check-prefixes=GCN,FAST %s + +; Check that Dst and SrcC of MFMA instructions reading more than 4 registers as SrcC +; is either completely disjoint or exactly the same, but does not alias. + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: +; GREEDY: v_mfma_f32_32x32x1f32 a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +; GREEDY: v_mfma_f32_32x32x1f32 a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +; FAST: v_mfma_f32_32x32x1f32 a[64:95], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] +; FAST: v_mfma_f32_32x32x1f32 a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] +; GCN: v_mfma_f32_32x32x1f32 a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <32 x float> %mai.2, <32 x float> %mai.1, <32 x i32> + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %tmp.1, i32 0, i32 0, i32 0) + store <32 x float> %mai.3, <32 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: +; GREEDY: v_mfma_f32_16x16x1f32 a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; GREEDY: v_mfma_f32_16x16x1f32 a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; FAST: v_mfma_f32_16x16x1f32 a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] +; FAST: v_mfma_f32_16x16x1f32 a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] +; GCN: v_mfma_f32_16x16x1f32 
a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <16 x float> %mai.2, <16 x float> %mai.1, <16 x i32> + %mai.3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %tmp.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.3, <16 x float> addrspace(1)* %arg + ret void +} + +; This instruction allows the overlap since it only read 4 registers. + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: +; GREEDY: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; GREEDY: v_mfma_f32_4x4x1f32 a[2:5], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; FAST: v_mfma_f32_4x4x1f32 a[8:11], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; FAST: v_mfma_f32_4x4x1f32 a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[8:11] +; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0) + %tmp.1 = shufflevector <4 x float> %mai.1, <4 x float> %mai.2, <4 x i32> + %mai.3 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %tmp.1, i32 0, i32 0, i32 0) + store <4 x float> %mai.3, <4 x float> addrspace(1)* %arg + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } diff --git 
a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -1,37 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s -; GCN-LABEL: {{^}}max_24regs_32a_used: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN-DAG: v_mfma_f32_16x16x1f32 -; GCN-DAG: v_mfma_f32_16x16x1f32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-NOT: buffer_store_dword -; GCN-NOT: buffer_load_dword -; GFX908-NOT: v_accvgpr_write_b32 -; GFX90A: v_accvgpr_write_b32 -; GCN: ScratchSize: 0 -define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 { -bb: - %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg - %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) - %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) - %elt1 = extractelement <16 x float> %mai.2, i32 0 - %elt2 = extractelement <16 x float> %mai.1, i32 15 - %elt3 = extractelement <16 x float> %mai.1, i32 14 - %elt4 = extractelement <16 x float> %mai.2, i32 1 - store float %elt1, float addrspace(1)* %out - %gep1 = getelementptr float, float addrspace(1)* %out, i64 1 - store float %elt2, float addrspace(1)* %gep1 - %gep2 = getelementptr float, float addrspace(1)* %out, i64 2 - store float %elt3, float addrspace(1)* %gep2 - %gep3 = getelementptr float, float addrspace(1)* %out, i64 3 - store float %elt4, float addrspace(1)* %gep3 - - ret void -} - ; GCN-LABEL: {{^}}max_12regs_13a_used: ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GCN-NOT: s_mov_b32 
s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 @@ -152,7 +121,6 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) -attributes #0 = { nounwind "amdgpu-num-vgpr"="24" } attributes #1 = { nounwind "amdgpu-num-vgpr"="10" } attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }