diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -292,6 +292,11 @@ VgprVmemTypes[GprNo] = 0; } + void setNonKernelFunctionInitialState() { + for (InstCounterType Counter : inst_counter_types()) + setScoreUB(Counter, getWaitCountMax(Counter)); + } + void print(raw_ostream &); void dump() { print(dbgs()); } @@ -1857,6 +1862,12 @@ ; BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); + // Initialize the entry block's incoming state as if all counters are dirty. + auto NonKernelInitialState = + std::make_unique<WaitcntBrackets>(ST, Limits, Encoding); + NonKernelInitialState->setNonKernelFunctionInitialState(); + BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); + Modified = true; } diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll --- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll +++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll @@ -33,6 +33,8 @@ ; GFX11-LABEL: test1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm tail call void @llvm.amdgcn.endpgm() unreachable diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll --- a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll @@ -45,6 +45,7 @@ ; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-BACKOFF-NEXT: flat_load_b32 v0, v[0:1] ; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-BACKOFF-NEXT: s_barrier ; GFX11-BACKOFF-NEXT: flat_store_b32 v[2:3], v0 ; GFX11-BACKOFF-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir 
b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir @@ -48,6 +48,7 @@ ; GFX9-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec ; GFX9-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: max-counter-lgkmcnt ; GFX10: liveins: $vgpr99 ; GFX10-NEXT: {{ $}} @@ -79,6 +80,7 @@ ; GFX10-NEXT: S_WAITCNT 52863 ; GFX10-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: max-counter-lgkmcnt ; GFX11: liveins: $vgpr99 ; GFX11-NEXT: {{ $}} @@ -109,6 +111,8 @@ ; GFX11-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec ; GFX11-NEXT: S_WAITCNT 64743 ; GFX11-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec + ; GFX11-NEXT: S_NOP 0 + ; GFX11-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0 ; GFX11-NEXT: S_ENDPGM 0 $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec @@ -219,6 +223,7 @@ ; GFX9-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec ; GFX9-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: max-counter-vmcnt ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 ; GFX10-NEXT: {{ $}} @@ -296,6 +301,7 @@ ; GFX10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec ; GFX10-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: max-counter-vmcnt ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 ; GFX11-NEXT: {{ $}} @@ -372,6 +378,8 @@ ; GFX11-NEXT: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec ; GFX11-NEXT: $vgpr2 = 
V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec ; GFX11-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec + ; GFX11-NEXT: S_NOP 0 + ; GFX11-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0 ; GFX11-NEXT: S_ENDPGM 0 $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec @@ -468,6 +476,7 @@ ; GFX9-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec ; GFX9-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec ; GFX9-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: max-counter-expcnt ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -482,6 +491,7 @@ ; GFX10-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec ; GFX10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: max-counter-expcnt ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} @@ -495,6 +505,8 @@ ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec ; GFX11-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_NOP 0 + ; GFX11-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0 ; GFX11-NEXT: S_ENDPGM 0 EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec