diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -83,6 +83,10 @@ // an s_delay_alu instruction. static const unsigned TRANS_MAX = 4; + // The maximum number of SALU cycles we can encode in an s_delay_alu + // instruction. + static const unsigned SALU_CYCLES_MAX = 3; + // If it was written by a (non-TRANS) VALU, remember how many clock cycles // are left until it completes, and how many other (non-TRANS) VALU we have // seen since it was issued. @@ -120,7 +124,9 @@ TRANSNumVALU = 0; break; case SALU: - SALUCycles = Cycles; + // Guard against pseudo-instructions like SI_CALL which are marked as + // SALU but with a very high latency. + SALUCycles = std::min(Cycles, SALU_CYCLES_MAX); break; } } @@ -349,6 +355,7 @@ if (instructionWaitsForVALU(MI)) { // Forget about all outstanding VALU delays. + // TODO: This is overkill since it also forgets about SALU delays. State = DelayState(); } else if (Type != OTHER) { DelayInfo Delay; diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -119,7 +119,7 @@ ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(/* invalid instid value */) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1 ; GFX11-NEXT: s_lshr_b32 s2, s2, s30 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)