Index: llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -561,13 +561,25 @@ return std::numeric_limits::max(); } +// Check if this is inline asm, but ignoring asm with empty content. These are +// currently used as hacks to work around the lack of convergence tokens. +static bool isNonEmptyInlineAsm(const MachineInstr &MI) { + if (!MI.isInlineAsm()) + return false; + + // TODO: Device libraries using asm hacks with non-empty strings. + const char *AsmStr = MI.getOperand(0).getSymbolName(); + return AsmStr && AsmStr[0]; +} + int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) { - return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI); + return (IsHazardDef(MI) || isNonEmptyInlineAsm(MI)) && + MI.modifiesRegister(Reg, TRI); }; return getWaitStatesSince(IsHazardFn, Limit); @@ -1961,7 +1973,7 @@ unsigned Opc = MI->getOpcode(); auto IsVALUFn = [](const MachineInstr &MI) { - return SIInstrInfo::isVALU(MI) || MI.isInlineAsm(); + return SIInstrInfo::isVALU(MI); }; if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write @@ -2182,17 +2194,31 @@ if (!Use.isReg()) continue; Register Reg = Use.getReg(); - bool FullReg; + bool FullReg = false; const MachineInstr *MI1; auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, this](const MachineInstr &MI) { - if (!SIInstrInfo::isMFMA(MI)) - return false; - Register DstReg = MI.getOperand(0).getReg(); - FullReg = (DstReg == Reg); - MI1 = &MI; - return TRI.regsOverlap(DstReg, Reg); + if (SIInstrInfo::isMFMA(MI)) { + MI1 = &MI; + Register DstReg = MI.getOperand(0).getReg(); + FullReg = (DstReg == Reg); + return TRI.regsOverlap(DstReg, Reg); + } + + if (MI.isInlineAsm()) { + MI1 = &MI; + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + + Register DstReg = MO.getReg(); + if (TRI.regsOverlap(DstReg, Reg)) + return true; + } + } + + return false; }; WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - @@ -2233,6 +2259,10 @@ if (!isXDL(ST, *MI)) NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; break; + case AMDGPU::INLINEASM: + case AMDGPU::INLINEASM_BR: + NeedWaitStates = MaxWaitStates; + break; default: if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1)) break; @@ -2285,6 +2315,10 @@ case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; break; + case AMDGPU::INLINEASM: + case AMDGPU::INLINEASM_BR: + NeedWaitStates = MaxWaitStates; + break; default: switch (TSchedModel.computeInstrLatency(MI1)) { case 2: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll @@ -258,6 +258,7 @@ ; GFX7-NEXT: ;;#ASMEND ; GFX7-NEXT: s_mov_b32 s0, m0 ; GFX7-NEXT: s_mov_b32 m0, s2 +; GFX7-NEXT: s_nop 1 ; GFX7-NEXT: v_writelane_b32 v0, s0, m0 ; GFX7-NEXT: ; return to shader part epilog ; @@ -268,6 +269,7 @@ ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_mov_b32 s0, m0 ; GFX8-NEXT: s_mov_b32 m0, s2 +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_writelane_b32 v0, s0, m0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -290,6 +292,7 @@ ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ;;#ASMEND +; GFX7-NEXT: s_nop 3 ; GFX7-NEXT: v_writelane_b32 v0, s2, m0 ; GFX7-NEXT: ; return to shader part epilog ; @@ -298,6 +301,7 @@ ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_nop 3 ; GFX8-NEXT: v_writelane_b32 v0, s2, m0 ; GFX8-NEXT: ; return to shader part epilog ; Index: llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -13,6 +13,9 @@ ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def v[0:31] a[0:15] ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v34, a15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v34 @@ -104,6 +107,9 @@ ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v34, a1 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a16, v34 @@ -148,6 +154,9 @@ ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 @@ -386,10 +395,13 @@ ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def v[0:31] a[0:15] ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_read_b32 v35, a15 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; def v32 ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v35, a15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v35 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a14 @@ -440,6 +452,9 @@ ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v35, a1 ; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31] ; GFX908-NEXT: s_nop 0 @@ -448,6 +463,8 @@ ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v33, a2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v33 @@ -954,6 +971,9 @@ ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v34, a1 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a32, v34 @@ -998,6 +1018,9 @@ ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ; copy ; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 Index: llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -1,7 +1,7 @@ -# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,SICI +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,SICI,SI # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,SICI -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX9 +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX89 +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX9,GFX89 --- | define amdgpu_kernel void @div_fmas() { ret void } @@ -13,6 +13,8 @@ define amdgpu_kernel void @s_movrel() { ret void } define amdgpu_kernel void @v_interp() { ret void } define amdgpu_kernel void @dpp() { ret void } + define amdgpu_kernel void @salu_write_buffer_smrd() { ret void } + define amdgpu_kernel void @vmem_read_sgpr_valu_def() { ret void } ... --- # GCN-LABEL: name: div_fmas @@ -36,6 +38,16 @@ # GCN: V_DIV_SCALE_F32 # GCN: S_NOP 3 # GCN: V_DIV_FMAS_F32 + +# GCN-LABEL: bb.4: +# GCN: INLINEASM +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_DIV_FMAS_F32 + +# GCN-LABEL: bb.5: +# GCN: INLINEASM +# GCN-NEXT: V_DIV_FMAS_F32 + name: div_fmas body: | @@ -57,10 +69,16 @@ bb.3: $vgpr4, $vcc = V_DIV_SCALE_F32_e64 0, $vgpr1, 0, $vgpr1, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec - S_ENDPGM 0 -... + bb.4: + INLINEASM &"; non-empy asm", 1, 851978, def $vcc + $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; No hazard here because the asm is empty + bb.5: + INLINEASM &"", 1, 851978, def $vcc + $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec + S_ENDPGM 0 ... --- # GCN-LABEL: name: s_getreg @@ -427,3 +445,56 @@ $vgpr3 = V_MOV_B32_dpp $vgpr3, $vgpr0, 0, 15, 15, 0, implicit $exec S_ENDPGM 0 ... +--- + +# GCN-LABEL: name: salu_write_buffer_smrd +# GCN-LABEL: bb.0: +# GCN: $sgpr15 = S_MOV_B32 123 +# SI-NEXT: S_NOP 3 +# GCN-NEXT: S_BUFFER_LOAD_DWORD_SGPR + +# GCN-LABEL: bb.1: +# GCN: INLINEASM +# SI-NEXT: S_NOP 3 +# GCN-NEXT: S_BUFFER_LOAD_DWORD_SGPR + +name: salu_write_buffer_smrd +body: | + bb.0: + liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + $sgpr15 = S_MOV_B32 123 + $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16, 0 + + bb.1: + liveins: $sgpr17 + INLINEASM &"; foo", 1, 851978, def $sgpr14 + $sgpr18 = S_BUFFER_LOAD_DWORD_SGPR $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16, 0 + S_ENDPGM 0, implicit $sgpr17, implicit $sgpr18 +... + +--- + +# GCN-LABEL: name: vmem_read_sgpr_valu_def +# GCN-LABEL: bb.0: +# GCN: V_READLANE_B32 +# GFX89-NEXT: S_NOP 4 +# GCN-NEXT: BUFFER_STORE_DWORD_OFFSET + +# GCN-LABEL: bb.1: +# GCN: INLINEASM +# GFX89-NEXT: S_NOP 4 +# GCN-NEXT: BUFFER_STORE_DWORD_OFFSET + +name: vmem_read_sgpr_valu_def +body: | + bb.0: + liveins: $sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 + + $sgpr6 = V_READLANE_B32 $vgpr0, 0 + BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr6, 0, 0, 0, 0, implicit $exec + + bb.1: + INLINEASM &"; foo", 1, 851978, def $sgpr7 + BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7, 0, 0, 0, 0, implicit $exec + S_ENDPGM 0 +... Index: llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir +++ llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir @@ -1483,3 +1483,47 @@ $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec BUFFER_STORE_DWORDX2_OFFEN_exact $vgpr2_vgpr3, $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec ... + +--- + +# GCN-LABEL: name: mfma_write_agpr_mfma_read_overlap_asm +# GCN: INLINEASM +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: mfma_write_agpr_mfma_read_overlap_asm +body: | + bb.0: + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:AReg_128 */, def $agpr1_agpr2_agpr3_agpr4 + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... + +--- + +# GCN-LABEL: name: mfma_write_agpr_mfma_read_overlap_asm_second_def +# GCN: INLINEASM +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: mfma_write_agpr_mfma_read_overlap_asm_second_def +body: | + bb.0: + INLINEASM &"", 1 /* sideeffect attdialect */, 3211274 /* regdef:VReg_64 */, def $vgpr0_vgpr1, 3145738 /* regdef:AReg_64 */, def $agpr3_agpr4 + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... + +... +# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_mfma_read_overlap_asm +# GCN: INLINEASM +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_agpr_mfma_read_overlap_asm +body: | + bb.0: + INLINEASM &"", 1 /* sideeffect attdialect */, 5832714 /* regdef:VReg_128 */, def $vgpr0_vgpr1_vgpr2_vgpr3, 5767178 /* regdef:AReg_128 */, def $agpr2_agpr3_agpr4_agpr5 + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... Index: llvm/test/CodeGen/AMDGPU/mai-hazards.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -107,6 +107,18 @@ ... --- +# GCN-LABEL: name: mfma_write_agpr_mfma_read_overlap_asm +# GCN: INLINEASM +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_MFMA +name: mfma_write_agpr_mfma_read_overlap_asm +body: | + bb.0: + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:AReg_128 */, def $agpr1_agpr2_agpr3_agpr4 + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... +--- + # GCN-LABEL: name: mfma_write_agpr_mfma_read_partial # GCN: V_MFMA # GCN-NEXT: S_NOP 1 @@ -475,4 +487,3 @@ $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $flat_scr ... ----