Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -476,6 +476,10 @@
     return FlatScratchInsts;
   }
 
+  bool hasFlatLgkmVMemCountInOrder() const {
+    return getGeneration() > GFX9;
+  }
+
   bool hasD16LoadStore() const {
     return getGeneration() >= GFX9;
   }
Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -136,7 +136,7 @@
 // "s_waitcnt 0" before use.
 class BlockWaitcntBrackets {
 public:
-  BlockWaitcntBrackets() {
+  BlockWaitcntBrackets(const SISubtarget *SubTarget) : ST(SubTarget) {
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
          T = (enum InstCounterType)(T + 1)) {
       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
@@ -314,6 +314,7 @@
   void dump() { print(dbgs()); }
 
 private:
+  const SISubtarget *ST = nullptr;
   bool WaitAtBeginning = false;
   bool RevisitLoop = false;
   bool MixedExpTypes = false;
@@ -737,9 +738,12 @@
   const int32_t LB = getScoreLB(T);
   const int32_t UB = getScoreUB(T);
   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
-    if (T == VM_CNT && hasPendingFlat()) {
-      // If there is a pending FLAT operation, and this is a VM waitcnt,
-      // then we need to force a waitcnt 0 for VM.
+    if ((T == VM_CNT || T == LGKM_CNT) &&
+        hasPendingFlat() &&
+        !ST->hasFlatLgkmVMemCountInOrder()) {
+      // If there is a pending FLAT operation, and this is a VMem or LGKM
+      // waitcnt and the target can report early completion, then we need
+      // to force a waitcnt 0.
       NeedWait = CNT_MASK(T);
       setScoreLB(T, getScoreUB(T));
     } else if (counterOutOfOrder(T)) {
@@ -1202,7 +1206,7 @@
           if (!ScoreBracket) {
             assert(!BlockVisitedSet.count(TBB));
             BlockWaitcntBracketsMap[TBB] =
-                llvm::make_unique<BlockWaitcntBrackets>();
+                llvm::make_unique<BlockWaitcntBrackets>(ST);
             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
           }
           ScoreBracket->setRevisitLoop(true);
@@ -1877,7 +1881,7 @@
 
     BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
     if (!ScoreBrackets) {
-      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
+      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
       ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
     }
     ScoreBrackets->setPostOrder(MBB.getNumber());
Index: test/CodeGen/AMDGPU/waitcnt.mir
===================================================================
--- test/CodeGen/AMDGPU/waitcnt.mir
+++ test/CodeGen/AMDGPU/waitcnt.mir
@@ -1,4 +1,5 @@
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waitcnts  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-insert-waitcnts  %s -o - | FileCheck -check-prefix=GFX89 %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-waitcnts  %s -o - | FileCheck -check-prefix=GFX89 %s
 
 --- |
   define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
@@ -30,22 +31,14 @@
 
 # CHECK-LABEL: bb.1:
 # CHECK: FLAT_LOAD_DWORD
-# CHECK: S_WAITCNT 368
+# GFX89: S_WAITCNT 112
 # CHECK: FLAT_LOAD_DWORDX4
-# The first load has no mem operand, so we should assume it accesses the flat
-# address space.
-# s_waitcnt lgkmcnt(1)
-# CHECK-NEXT: S_WAITCNT 383
 
 # CHECK-LABEL: bb.2:
 # CHECK: FLAT_LOAD_DWORD
-# CHECK: S_WAITCNT 368
+# GFX89: S_WAITCNT 112
 # CHECK: FLAT_LOAD_DWORDX4
 
-# One outstanding load accesses the flat address space.
-# s_waitcnt lgkmcnt(1)
-# CHECK-NEXT: S_WAITCNT 383
-
 name: flat_zero_waitcnt
 
 body: |