Index: lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- lib/Target/AMDGPU/FLATInstructions.td +++ lib/Target/AMDGPU/FLATInstructions.td @@ -28,11 +28,6 @@ let SubtargetPredicate = isCIVI; let FLAT = 1; - // Internally, FLAT instruction are executed as both an LDS and a - // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT - // and are not considered done until both have been decremented. - let VM_CNT = 1; - let LGKM_CNT = 1; let UseNamedOperandTable = 1; let hasSideEffects = 0; @@ -61,6 +56,12 @@ // TODO: M0 if it could possibly access LDS (before gfx9? only)? let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]); + + // Internally, FLAT instructions are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. + let VM_CNT = 1; + let LGKM_CNT = !if(!or(is_flat_global, is_flat_scratch), 0, 1); } class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1151,8 +1151,7 @@ // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. 
- uint64_t TSFlags = Inst.getDesc().TSFlags; - if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) { + if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) && TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); @@ -1162,8 +1161,12 @@ } } else if (TII->isFLAT(Inst)) { assert(Inst.mayLoad() || Inst.mayStore()); - ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); - ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); + + if (TII->usesVM_CNT(Inst)) + ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); + + if (TII->usesLGKM_CNT(Inst)) + ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); // This is a flat memory operation. Check to see if it has memory // tokens for both LDS and Memory, and if so mark it as a flat. Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -496,6 +496,10 @@ return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT; } + static bool usesLGKM_CNT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT; + } + static bool sopkIsZext(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SOPK_ZEXT; } Index: test/CodeGen/AMDGPU/waitcnt-flat.ll =================================================================== --- test/CodeGen/AMDGPU/waitcnt-flat.ll +++ test/CodeGen/AMDGPU/waitcnt-flat.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -enable-var-scope 
-check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; If flat_store_dword and flat_load_dword use different registers for the data ; operand, this test is not broken. It just means it is no longer testing @@ -9,8 +10,19 @@ ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]] ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0) ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) { - store volatile i32 0, i32 addrspace(1)* %out - %val = load volatile i32, i32 addrspace(1)* %out +define amdgpu_kernel void @test(i32 addrspace(4)* %out, i32 %in) { + store volatile i32 0, i32 addrspace(4)* %out + %val = load volatile i32, i32 addrspace(4)* %out + ret void +} + +; Make sure lgkmcnt isn't used for global_* instructions +; GCN-LABEL: {{^}}test_waitcnt_type_flat_global: +; GFX9: global_load_dword [[LD:v[0-9]+]] +; GFX9-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX9-NEXT: ds_write_b32 [[LD]] +define amdgpu_kernel void @test_waitcnt_type_flat_global(i32 addrspace(1)* %in) { + %val = load volatile i32, i32 addrspace(1)* %in + store volatile i32 %val, i32 addrspace(3)* undef ret void }