# AMDGPU: give gfx940 its own scheduling model.
#
# GCNHazardRecognizer.cpp
#   The `case 4` assertion now also passes when ST.hasGFX940Insts() is true,
#   instead of requiring a DGEMM opcode.
# GCNProcessors.td
#   The "gfx940" ProcessorModel switches from SIDPFullSpeedModel to
#   SIDPGFX940FullSpeedModel.
# SISchedule.td
#   Declares Write4PassMAI and SIDPGFX940FullSpeedModel, adds a write resource
#   under ResourceCycles = [4], and adds a SchedModel block that maps the
#   V_MFMA_* opcode patterns onto the 2/4/8/16-pass MAI and 4/8-pass DGEMM
#   write classes.
#
# NOTE(review): every `def : HWWriteRes;` and `def : HWVALUWriteRes;` below has
#   no template arguments. That looks like lost `<...>` text rather than the
#   intended definition; confirm against the upstream change before applying.
# NOTE(review): the indentation of context lines is reconstructed, not copied
#   from the target files. If a hunk fails to match, apply with
#   `git apply --ignore-whitespace`.
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1688,7 +1688,7 @@
         NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
         break;
       case 4:
-        assert(isDGEMM(MFMA->getOpcode()));
+        assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
         NeedWaitStates =
             IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                           : DMFMA4x4WriteVgprVALUReadWaitStates;
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -192,7 +192,7 @@
   FeatureISAVersion9_0_C.Features
 >;
 
-def : ProcessorModel<"gfx940", SIDPFullSpeedModel,
+def : ProcessorModel<"gfx940", SIDPGFX940FullSpeedModel,
   FeatureISAVersion9_4_0.Features
 >;
 
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -59,6 +59,7 @@
 
 // mAI multipass instructions.
 def Write2PassMAI : SchedWrite;
+def Write4PassMAI : SchedWrite;
 def Write8PassMAI : SchedWrite;
 def Write16PassMAI : SchedWrite;
 def Write4PassDGEMM : SchedWrite;
@@ -86,6 +87,7 @@
 def SIFullSpeedModel : SISchedMachineModel;
 def SIQuarterSpeedModel : SISchedMachineModel;
 def SIDPFullSpeedModel : SISchedMachineModel;
+def SIDPGFX940FullSpeedModel : SISchedMachineModel;
 def GFX10SpeedModel : SISchedMachineModel;
 
 // XXX: Are the resource counts correct?
@@ -156,6 +158,8 @@
 
   let ResourceCycles = [2] in
   def : HWWriteRes;
+  let ResourceCycles = [4] in
+  def : HWWriteRes;
   let ResourceCycles = [8] in
   def : HWWriteRes;
   let ResourceCycles = [16] in
@@ -244,6 +248,33 @@
 
 } // End SchedModel = SIDPFullSpeedModel
 
+let SchedModel = SIDPGFX940FullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes;
+def : HWVALUWriteRes;
+def : HWVALUWriteRes;
+def : HWVALUWriteRes;
+def : HWVALUWriteRes;
+def : HWVALUWriteRes;
+def : HWVALUWriteRes;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
+def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;
+
+def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>;
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>;
+
+def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>;
+def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>;
+
+def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
+def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;
+
+} // End SchedModel = SIDPGFX940FullSpeedModel
+
 let SchedModel = GFX10SpeedModel in {
 
 // The latency values are 1 / (operations / cycle).