Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -472,6 +472,10 @@ return FlatScratchInsts; } + bool hasFlatLgkmVMemCountInOrder() const { + return getGeneration() > GFX9; + } + bool hasD16LoadStore() const { return getGeneration() >= GFX9; } Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -136,7 +136,7 @@ // "s_waitcnt 0" before use. class BlockWaitcntBrackets { public: - BlockWaitcntBrackets() { + BlockWaitcntBrackets(const SISubtarget *SubTarget) : ST(SubTarget) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { memset(VgprScores[T], 0, sizeof(VgprScores[T])); @@ -314,6 +314,7 @@ void dump() { print(dbgs()); } private: + const SISubtarget *ST = nullptr; bool WaitAtBeginning = false; bool RevisitLoop = false; bool MixedExpTypes = false; @@ -735,9 +736,12 @@ const int32_t LB = getScoreLB(T); const int32_t UB = getScoreUB(T); if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { - if (T == VM_CNT && hasPendingFlat()) { - // If there is a pending FLAT operation, and this is a VM waitcnt, - // then we need to force a waitcnt 0 for VM. + if ((T == VM_CNT || T == LGKM_CNT) && + hasPendingFlat() && + !ST->hasFlatLgkmVMemCountInOrder()) { + // If there is a pending FLAT operation, and this is a VMem or LGKM + // waitcnt and the target can report early completion, then we need + // to force a waitcnt 0. 
NeedWait = CNT_MASK(T); setScoreLB(T, getScoreUB(T)); } else if (counterOutOfOrder(T)) { @@ -1200,7 +1204,7 @@ if (!ScoreBracket) { assert(!BlockVisitedSet.count(TBB)); BlockWaitcntBracketsMap[TBB] = - llvm::make_unique<BlockWaitcntBrackets>(); + llvm::make_unique<BlockWaitcntBrackets>(ST); ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); } ScoreBracket->setRevisitLoop(true); @@ -1879,7 +1883,7 @@ BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); if (!ScoreBrackets) { - BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(); + BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST); ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); } ScoreBrackets->setPostOrder(MBB.getNumber()); Index: llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir +++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir @@ -1,4 +1,5 @@ -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waitcnts %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GFX89 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GFX89 %s --- | define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4, @@ -30,22 +31,14 @@ # CHECK-LABEL: bb.1: # CHECK: FLAT_LOAD_DWORD -# CHECK: S_WAITCNT 368 +# GFX89: S_WAITCNT 112 # CHECK: FLAT_LOAD_DWORDX4 -# The first load has no mem operand, so we should assume it accesses the flat -# address space. -# s_waitcnt lgkmcnt(1) -# CHECK-NEXT: S_WAITCNT 383 # CHECK-LABEL: bb.2: # CHECK: FLAT_LOAD_DWORD -# CHECK: S_WAITCNT 368 +# GFX89: S_WAITCNT 112 # CHECK: FLAT_LOAD_DWORDX4 -# One outstanding load accesses the flat address space. -# s_waitcnt lgkmcnt(1) -# CHECK-NEXT: S_WAITCNT 383 - name: flat_zero_waitcnt body: |