Index: lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- lib/Target/AMDGPU/FLATInstructions.td +++ lib/Target/AMDGPU/FLATInstructions.td @@ -28,11 +28,6 @@ let SubtargetPredicate = isCIVI; let FLAT = 1; - // Internally, FLAT instruction are executed as both an LDS and a - // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT - // and are not considered done until both have been decremented. - let VM_CNT = 1; - let LGKM_CNT = 1; let UseNamedOperandTable = 1; let hasSideEffects = 0; @@ -61,6 +56,12 @@ // TODO: M0 if it could possibly access LDS (before gfx9? only)? let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]); + + // Internally, FLAT instructions are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. + let VM_CNT = 1; + let LGKM_CNT = !if(!or(is_flat_global, is_flat_scratch), 0, 1); } class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1151,8 +1151,7 @@ // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. 
- uint64_t TSFlags = Inst.getDesc().TSFlags; - if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) { + if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) && TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); @@ -1162,8 +1161,12 @@ } } else if (TII->isFLAT(Inst)) { assert(Inst.mayLoad() || Inst.mayStore()); - ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); - ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); + + if (TII->usesVM_CNT(Inst)) + ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); + + if (TII->usesLGKM_CNT(Inst)) + ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); // This is a flat memory operation. Check to see if it has memory // tokens for both LDS and Memory, and if so mark it as a flat. Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -496,6 +496,10 @@ return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT; } + static bool usesLGKM_CNT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT; + } + static bool sopkIsZext(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SOPK_ZEXT; } Index: test/CodeGen/AMDGPU/waitcnt-flat.ll =================================================================== --- test/CodeGen/AMDGPU/waitcnt-flat.ll +++ test/CodeGen/AMDGPU/waitcnt-flat.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope 
-check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; If flat_store_dword and flat_load_dword use different registers for the data ; operand, this test is not broken. It just means it is no longer testing @@ -9,8 +10,19 @@ ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]] ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0) ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) { - store volatile i32 0, i32 addrspace(1)* %out - %val = load volatile i32, i32 addrspace(1)* %out +define amdgpu_kernel void @test(i32 addrspace(4)* %out, i32 %in) { + store volatile i32 0, i32 addrspace(4)* %out + %val = load volatile i32, i32 addrspace(4)* %out + ret void +} + +; Make sure lgkmcnt isn't used for global_* instructions +; GCN-LABEL: {{^}}test_waitcnt_type_flat_global: +; GFX9: global_load_dword [[LD:v[0-9]+]] +; GFX9-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX9-NEXT: ds_write_b32 [[LD]] +define amdgpu_kernel void @test_waitcnt_type_flat_global(i32 addrspace(1)* %in) { + %val = load volatile i32, i32 addrspace(1)* %in + store volatile i32 %val, i32 addrspace(3)* undef ret void }