diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2266,10 +2266,10 @@
 
   int WaitStatesNeeded = 0;
 
-  bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
-                       SIInstrInfo::isFLAT(*MI) ||
-                       SIInstrInfo::isDS(*MI) ||
-                       SIInstrInfo::isEXP(*MI);
+  bool IsMem = SIInstrInfo::isVMEM(*MI) ||
+               SIInstrInfo::isFLAT(*MI) ||
+               SIInstrInfo::isDS(*MI);
+  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
   bool IsVALU = SIInstrInfo::isVALU(*MI);
 
   const MachineInstr *MFMA = nullptr;
@@ -2313,6 +2313,7 @@
     const int DotWriteSameDotReadSrcAB = 3;
     const int DotWriteDifferentVALURead = 3;
     const int MaxWaitStates = 19;
+    const int DMFMABetweenVALUWriteVMEMRead = 2;
 
     for (const MachineOperand &Use : MI->explicit_uses()) {
       if (!Use.isReg())
@@ -2335,6 +2336,40 @@
         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
       }
 
+      // Workaround for HW data hazard bug observed only in GFX90A. When there
+      // is a DGEMM instruction in-between a VALU and a VMEM instruction it
+      // causes the SQ to incorrectly not insert two wait states between the two
+      // instructions needed to avoid data hazard.
+      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
+        if (TRI.isVectorRegister(MF.getRegInfo(), Use.getReg())) {
+          // Record a DGEMM while walking back from MI towards the def rather
+          // than peeking at the def's successor in its block: that successor
+          // may be a debug or meta instruction, and a def ending its block
+          // would need the fallthrough chain followed by hand.
+          // NOTE(review): assumes getWaitStatesSinceDef() calls the predicate
+          // on every instruction it walks over, nearest to MI first.
+          bool SeenDGEMM = false;
+          auto IsDGEMMHazard = [&SeenDGEMM, this](const MachineInstr &MI) {
+            // Only a hazard if this candidate def is a VALU and a DGEMM was
+            // already seen, i.e. the DGEMM sits between the def and MI.
+            const bool IsHazard = SeenDGEMM && TII.isVALU(MI);
+
+            // Remember a DGEMM found on the reverse traversal to the def.
+            if (isDGEMM(MI.getOpcode()))
+              SeenDGEMM = true;
+
+            return IsHazard;
+          };
+
+          int WaitStatesNeededForUse =
+              DMFMABetweenVALUWriteVMEMRead -
+              getWaitStatesSinceDef(Reg, IsDGEMMHazard,
+                                    DMFMABetweenVALUWriteVMEMRead);
+
+          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+        }
+      }
+
       MFMA = nullptr;
       WaitStatesSinceDef =
           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -191,6 +191,7 @@
   bool HasFlatSegmentOffsetBug = false;
   bool HasImageStoreD16Bug = false;
   bool HasImageGather4D16Bug = false;
+  bool HasDGEMMVALUWriteMemOpBug = false;
   bool HasVOPDInsts = false;
 
   // Dummy feature to use for assembler in tablegen.
@@ -900,6 +901,8 @@
 
   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
 
+  bool hasDGEMMVALUWriteMemOpBug() const { return HasDGEMMVALUWriteMemOpBug; }
+
   bool hasNSAEncoding() const { return HasNSAEncoding; }
 
   unsigned getNSAMaxSize() const { return NSAMaxSize; }
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir
@@ -1308,3 +1308,182 @@
     $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
     $vgpr4_vgpr5 = V_FMAC_F64_e32 $vgpr4_vgpr5, $vgpr4_vgpr5, $vgpr4_vgpr5, implicit $mode, implicit $exec
 ...
+# The tests below cover the GFX90A workaround for a DGEMM placed between a
+# VALU write of a VGPR and a memory instruction that reads it: one extra
+# S_NOP is expected before buffer, global, DS, flat and scratch accesses,
+# including when the sequence is split across fallthrough blocks.
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            dgemm_between_valu_write_buffer_store
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_load
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: BUFFER_LOAD_DWORD
+name:            dgemm_between_valu_write_buffer_load
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_global_store
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: GLOBAL_STORE_DWORD
+
+name:            dgemm_between_valu_write_global_store
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_global_load
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: GLOBAL_LOAD_DWORD
+name:            dgemm_between_valu_write_global_load
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_ds_write
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: DS_WRITE_B32
+name:            dgemm_between_valu_write_ds_write
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B32 $vgpr1, $vgpr0, 0, 0, implicit $m0, implicit $mode, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_ds_read
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: DS_READ_B32_gfx9
+name:            dgemm_between_valu_write_ds_read
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_flat_store
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_STORE_DWORD
+name:            dgemm_between_valu_write_flat_store
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    FLAT_STORE_DWORD $vgpr0_vgpr1, $agpr2, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_flat_load
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: FLAT_LOAD_DWORD
+name:            dgemm_between_valu_write_flat_load
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_scratch_store
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: SCRATCH_STORE_DWORD
+name:            dgemm_between_valu_write_scratch_store
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    SCRATCH_STORE_DWORD $vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_scratch_load
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP 0
+# GCN-NEXT: SCRATCH_LOAD_DWORD
+name:            dgemm_between_valu_write_scratch_load
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough1
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA
+# GCN:      bb.1:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            dgemm_between_valu_write_buffer_store_fallthrough1
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+
+  bb.1:
+    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough2
+# GCN:      V_MOV_B32_e32
+# GCN:      bb.1:
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            dgemm_between_valu_write_buffer_store_fallthrough2
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+
+  bb.1:
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_fallthrough3
+# GCN:      V_MOV_B32_e32
+# GCN:      bb.1:
+# GCN:      bb.2:
+# GCN-NEXT: V_MFMA
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            dgemm_between_valu_write_buffer_store_fallthrough3
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+
+  bb.1:
+
+  bb.2:
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
--- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
+++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir
@@ -2016,3 +2016,17 @@
     $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_MFMA_F32_32X32X1F32_vgprcd_e64 $agpr26, $agpr28, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
     $vgpr1 = V_MOV_B32_e32 0, implicit $exec
 ...
+# The workaround for a DGEMM between a VALU write and a memory read is applied
+# on GFX90A only, so no S_NOP is expected for the same sequence here.
+# GCN-LABEL: name: dgemm_between_valu_write_buffer_store_no_snop
+# GCN:      V_MOV_B32_e32
+# GCN-NEXT: V_MFMA_F64
+# GCN-NOT:  S_NOP
+# GCN-NEXT: BUFFER_STORE_DWORD
+name:            dgemm_between_valu_write_buffer_store_no_snop
+body:             |
+  bb.0:
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec
+    BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+...