diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1332,6 +1332,12 @@ return false; } +static bool isStoreCountWaitZero(const MachineInstr &I) { + return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && + I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && + !I.getOperand(1).getImm(); +} + bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { if (!RunLdsBranchVmemWARHazardFixup) return false; @@ -1351,9 +1357,7 @@ return false; auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { - return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && - I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && - !I.getOperand(1).getImm()); + return IsHazardInst(I) || isStoreCountWaitZero(I); }; auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { @@ -1370,9 +1374,7 @@ if (InstType == InstType2) return true; - return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && - I.getOperand(0).getReg() == AMDGPU::SGPR_NULL && - !I.getOperand(1).getImm(); + return isStoreCountWaitZero(I); }; return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) != diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -211,19 +211,20 @@ return ScoreUBs[T]; } + unsigned getScoreRange(InstCounterType T) const { + return getScoreUB(T) - getScoreLB(T); + } + // Mapping from event to counter. InstCounterType eventCounter(WaitEventType E) { - if (WaitEventMaskForInst[VM_CNT] & (1 << E)) - return VM_CNT; - if (WaitEventMaskForInst[LGKM_CNT] & (1 << E)) - return LGKM_CNT; - if (WaitEventMaskForInst[VS_CNT] & (1 << E)) - return VS_CNT; - assert(WaitEventMaskForInst[EXP_CNT] & (1 << E)); - return EXP_CNT; + for (auto T : inst_counter_types()) { + if (WaitEventMaskForInst[T] & (1 << E)) + return T; + } + llvm_unreachable("event type has no associated counter"); } - unsigned getRegScore(int GprNo, InstCounterType T) { + unsigned getRegScore(int GprNo, InstCounterType T) const { if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; } @@ -240,8 +241,7 @@ bool counterOutOfOrder(InstCounterType T) const; void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - void determineWait(InstCounterType T, unsigned ScoreToWait, - AMDGPU::Waitcnt &Wait) const; + void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const; void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, @@ -305,9 +305,9 @@ assert(T < NUM_INST_CNTS); ScoreUBs[T] = Val; if (T == EXP_CNT) { - unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); - if (ScoreLBs[T] < UB && UB < ScoreUBs[T]) - ScoreLBs[T] = UB; + unsigned UB = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); + if (ScoreLBs[EXP_CNT] < UB && UB < ScoreUBs[EXP_CNT]) + ScoreLBs[EXP_CNT] = UB; } } @@ -694,29 +694,30 @@ void WaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; for (auto T : inst_counter_types()) { - unsigned LB = getScoreLB(T); - unsigned UB = getScoreUB(T); + unsigned SR = getScoreRange(T); switch (T) { case VM_CNT: - OS << " VM_CNT(" << UB - LB << "): "; + OS << " VM_CNT(" << SR << "): "; break; case LGKM_CNT: - OS << " LGKM_CNT(" << UB - LB << "): "; + OS << " LGKM_CNT(" << SR << "): "; break; case EXP_CNT: - OS << " EXP_CNT(" << UB - LB << "): "; + OS << " EXP_CNT(" << SR << "): "; break; case VS_CNT: - OS << " VS_CNT(" << UB - LB << "): "; + OS << " VS_CNT(" << SR << "): "; break; default: - OS << " UNKNOWN(" << UB - LB << "): "; + OS << " UNKNOWN(" << SR << "): "; break; } - if (LB < UB) { + if (SR != 0) { // Print vgpr scores. + unsigned LB = getScoreLB(T); + for (int J = 0; J <= VgprUB; J++) { unsigned RegScore = getRegScore(J, T); if (RegScore <= LB) @@ -755,18 +756,17 @@ void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, unsigned &Count) const { - const unsigned LB = getScoreLB(T); - const unsigned UB = getScoreUB(T); - // The number of outstanding events for this type, T, can be calculated // as (UB - LB). If the current Count is greater than or equal to the number // of outstanding events, then the wait for this counter is redundant. - if (Count >= UB - LB) + if (Count >= getScoreRange(T)) Count = ~0u; } -void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait, +void WaitcntBrackets::determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const { + unsigned ScoreToWait = getRegScore(RegNo, T); + // If the score of src_operand falls within the bracket, we need an // s_waitcnt instruction. const unsigned LB = getScoreLB(T); @@ -1106,8 +1106,7 @@ for (int RegNo = CallAddrOpInterval.first; RegNo < CallAddrOpInterval.second; ++RegNo) - ScoreBrackets.determineWait( - LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); int RtnAddrOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); @@ -1117,8 +1116,7 @@ for (int RegNo = RtnAddrOpInterval.first; RegNo < RtnAddrOpInterval.second; ++RegNo) - ScoreBrackets.determineWait( - LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); } } } else { @@ -1150,11 +1148,9 @@ continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); if (Memop->isStore()) { - ScoreBrackets.determineWait( - EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); + ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } } @@ -1176,17 +1172,14 @@ if (Op.isUse() || !SIInstrInfo::isVMEM(MI) || ScoreBrackets.hasOtherPendingVmemTypes(RegNo, getVmemType(MI))) { - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); ScoreBrackets.clearVgprVmemTypes(RegNo); } if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { - ScoreBrackets.determineWait( - EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); + ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } } - ScoreBrackets.determineWait( - LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); } } } @@ -1205,8 +1198,7 @@ // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { - if (ScoreBrackets.getScoreLB(LGKM_CNT) < - ScoreBrackets.getScoreUB(LGKM_CNT) && + if (ScoreBrackets.getScoreRange(LGKM_CNT) != 0 && ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { Wait.LgkmCnt = 0; } @@ -1228,9 +1220,7 @@ Wait.VsCnt = 0; if (FlushVmCnt) { - unsigned UB = ScoreBrackets.getScoreUB(VM_CNT); - unsigned LB = ScoreBrackets.getScoreLB(VM_CNT); - if (UB - LB != 0) + if (ScoreBrackets.getScoreRange(VM_CNT) != 0) Wait.VmCnt = 0; } @@ -1245,9 +1235,7 @@ MachineInstr *OldWaitcntInstr) { AMDGPU::Waitcnt Wait; - unsigned UB = ScoreBrackets.getScoreUB(VM_CNT); - unsigned LB = ScoreBrackets.getScoreLB(VM_CNT); - if (UB - LB == 0) + if (ScoreBrackets.getScoreRange(VM_CNT) == 0) return false; Wait.VmCnt = 0; @@ -1603,8 +1591,7 @@ // 2. Restore the correct value of vccz by writing the current value // of vcc back to vcc. if (ST->hasReadVCCZBug() && - ScoreBrackets.getScoreLB(LGKM_CNT) < - ScoreBrackets.getScoreUB(LGKM_CNT) && + ScoreBrackets.getScoreRange(LGKM_CNT) != 0 && ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { // Writes to vcc while there's an outstanding smem read may get // clobbered as soon as any read completes.