Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -127,6 +127,22 @@ (w) < (enum WaitEventType)NUM_WAIT_EVENTS; \ (w) = (enum WaitEventType)((w) + 1)) +void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { + switch (T) { + case VM_CNT: + Wait.VmCnt = std::min(Wait.VmCnt, Count); + break; + case EXP_CNT: + Wait.ExpCnt = std::min(Wait.ExpCnt, Count); + break; + case LGKM_CNT: + Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count); + break; + default: + llvm_unreachable("bad InstCounterType"); + } +} + // This is a per-basic-block object that maintains current score brackets // of each wait counter, and a per-register scoreboard for each wait counter. // We also maintain the latest score for every event type that can change the @@ -233,6 +249,7 @@ if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; } + assert(T == LGKM_CNT); return SgprScores[GprNo - NUM_ALL_VGPRS]; } @@ -269,7 +286,12 @@ } bool counterOutOfOrder(InstCounterType T) const; - unsigned int updateByWait(InstCounterType T, int ScoreToWait); + bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + void determineWait(InstCounterType T, int ScoreToWait, + AMDGPU::Waitcnt &Wait) const; + void applyWaitcnt(const AMDGPU::Waitcnt &Wait); + void applyWaitcnt(InstCounterType T, unsigned Count); void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &MI); @@ -301,10 +323,6 @@ void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; } int32_t getPostOrder() const { return PostOrder; } - void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; } - void clearWaitcnt() { Waitcnt = nullptr; } - MachineInstr *getWaitcnt() const { return Waitcnt; } - bool mixedExpTypes() const { return MixedExpTypes; } 
void setMixedExpTypes(bool MixedExpTypesIn) { MixedExpTypes = MixedExpTypesIn; @@ -319,7 +337,6 @@ bool RevisitLoop = false; bool MixedExpTypes = false; int32_t PostOrder = 0; - MachineInstr *Waitcnt = nullptr; int32_t ScoreLBs[NUM_INST_CNTS] = {0}; int32_t ScoreUBs[NUM_INST_CNTS] = {0}; int32_t EventUBs[NUM_WAIT_EVENTS] = {0}; @@ -445,7 +462,8 @@ bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; void generateWaitcntInstBefore(MachineInstr &MI, - BlockWaitcntBrackets *ScoreBrackets); + BlockWaitcntBrackets *ScoreBrackets, + MachineInstr *OldWaitcntInstr); void updateEventWaitcntAfter(MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets); void mergeInputScoreBrackets(MachineBasicBlock &Block); @@ -453,8 +471,6 @@ unsigned countNumBottomBlocks(const MachineLoop *Loop); void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block); void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst); - bool isWaitcntStronger(unsigned LHS, unsigned RHS); - unsigned combineWaitcnt(unsigned LHS, unsigned RHS); }; } // end anonymous namespace @@ -712,17 +728,34 @@ OS << '\n'; } -unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, - int ScoreToWait) { - unsigned int NeedWait = 0; +/// Simplify the waitcnt, in the sense of removing redundant counts, and return +/// whether a waitcnt instruction is needed at all. 
+bool BlockWaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { + return simplifyWaitcnt(VM_CNT, Wait.VmCnt) | + simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) | + simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); +} + +bool BlockWaitcntBrackets::simplifyWaitcnt(InstCounterType T, + unsigned &Count) const { + const int32_t LB = getScoreLB(T); + const int32_t UB = getScoreUB(T); + if (Count < (unsigned)UB && UB - (int32_t)Count > LB) + return true; + + Count = ~0u; + return false; +} + +void BlockWaitcntBrackets::determineWait(InstCounterType T, int ScoreToWait, + AMDGPU::Waitcnt &Wait) const { if (ScoreToWait == -1) { // The score to wait is unknown. This implies that it was not encountered // during the path of the CFG walk done during the current traversal but // may be seen on a different path. Emit an s_wait counter with a // conservative value of 0 for the counter. - NeedWait = CNT_MASK(T); - setScoreLB(T, getScoreUB(T)); - return NeedWait; + addWait(Wait, T, 0); + return; } // If the score of src_operand falls within the bracket, we need an @@ -736,21 +769,38 @@ // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need // to force a waitcnt 0. - NeedWait = CNT_MASK(T); - setScoreLB(T, getScoreUB(T)); + addWait(Wait, T, 0); } else if (counterOutOfOrder(T)) { // Counter can get decremented out-of-order when there // are multiple types event in the bracket. Also emit an s_wait counter // with a conservative value of 0 for the counter. 
- NeedWait = CNT_MASK(T); - setScoreLB(T, getScoreUB(T)); + addWait(Wait, T, 0); } else { - NeedWait = CNT_MASK(T); - setScoreLB(T, ScoreToWait); + addWait(Wait, T, UB - ScoreToWait); } } +} - return NeedWait; +void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { + applyWaitcnt(VM_CNT, Wait.VmCnt); + applyWaitcnt(EXP_CNT, Wait.ExpCnt); + applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); + + if (Wait.ExpCnt == 0) + setMixedExpTypes(false); +} + +void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { + const int32_t UB = getScoreUB(T); + if (Count >= (unsigned)UB) + return; + if (Count != 0) { + if (counterOutOfOrder(T)) + return; + setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count)); + } else { + setScoreLB(T, UB); + } } // Where there are multiple types of event in the bracket of a counter, @@ -841,29 +891,6 @@ !MI.getOperand(1).isUndef(); } -/// Given wait count encodings checks if LHS is stronger than RHS. -bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) { - if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS)) - return false; - if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS)) - return false; - if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS)) - return false; - return true; -} - -/// Given wait count encodings create a new encoding which is stronger -/// or equal to both. -unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) { - unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS), - AMDGPU::decodeVmcnt(IV, RHS)); - unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS), - AMDGPU::decodeLgkmcnt(IV, RHS)); - unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS), - AMDGPU::decodeExpcnt(IV, RHS)); - return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt); -} - /// Generate s_waitcnt instruction to be placed before cur_Inst. 
/// Instructions of a given type are returned in order, /// but instructions of different types can complete out of order. @@ -875,31 +902,23 @@ /// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). void SIInsertWaitcnts::generateWaitcntInstBefore( - MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) { - // To emit, or not to emit - that's the question! - // Start with an assumption that there is no need to emit. - unsigned int EmitWaitcnt = 0; - - // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug - bool ForceEmitZeroWaitcnt = false; - + MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets, + MachineInstr *OldWaitcntInstr) { setForceEmitWaitcnt(); bool IsForceEmitWaitcnt = isForceEmitWaitcnt(); if (MI.isDebugInstr()) return; + AMDGPU::Waitcnt Wait; + // See if an s_waitcnt is forced at block entry, or is needed at // program end. if (ScoreBrackets->getWaitAtBeginning()) { // Note that we have already cleared the state, so we don't need to update // it. ScoreBrackets->clearWaitAtBeginning(); - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - EmitWaitcnt |= CNT_MASK(T); - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - } + Wait = AMDGPU::Waitcnt::allZero(); } // See if this instruction has a forced S_WAITCNT VM. @@ -907,8 +926,7 @@ else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { - EmitWaitcnt |= - ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); + Wait.VmCnt = 0; } // All waits must be resolved at call return. @@ -916,23 +934,14 @@ // with knowledge of the called routines. 
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - EmitWaitcnt |= CNT_MASK(T); - } - } + Wait = AMDGPU::Waitcnt::allZero(); } // Resolve vm waits before gs-done. else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) == AMDGPU::SendMsg::ID_GS_DONE)) { - if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) { - ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitWaitcnt |= CNT_MASK(VM_CNT); - } + Wait.VmCnt = 0; } #if 0 // TODO: the following blocks of logic when we have fence. else if (MI.getOpcode() == SC_FENCE) { @@ -996,14 +1005,14 @@ if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { // Export and GDS are tracked individually, either may trigger a waitcnt // for EXEC. - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK)); + ScoreBrackets->determineWait( + EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK), Wait); + ScoreBrackets->determineWait( + EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS), Wait); + ScoreBrackets->determineWait( + EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS), Wait); + ScoreBrackets->determineWait( + EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK), Wait); } #if 0 // TODO: the following code to handle CALL. 
@@ -1035,8 +1044,8 @@ continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + ScoreBrackets->determineWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait); } for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { @@ -1047,11 +1056,11 @@ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Op.getReg())) { // VM_CNT is only relevant to vgpr or LDS. - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + ScoreBrackets->determineWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait); } - EmitWaitcnt |= ScoreBrackets->updateByWait( - LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); + ScoreBrackets->determineWait( + LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait); } } // End of for loop that looks at all source operands to decide vm_wait_cnt @@ -1069,10 +1078,10 @@ if (AS != AMDGPUAS::LOCAL_ADDRESS) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); + ScoreBrackets->determineWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets->determineWait( + EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait); } } for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { @@ -1082,13 +1091,13 @@ ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Def.getReg())) { - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getRegScore(RegNo, 
EXP_CNT)); + ScoreBrackets->determineWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets->determineWait( + EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait); } - EmitWaitcnt |= ScoreBrackets->updateByWait( - LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); + ScoreBrackets->determineWait( + LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait); } } // End of for loop that looks at all dest operands. } @@ -1099,12 +1108,7 @@ // requiring a WAITCNT beforehand. if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier()) { - EmitWaitcnt |= - ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); + Wait = AMDGPU::Waitcnt::allZero(); } // TODO: Remove this work-around, enable the assert for Bug 457939 @@ -1114,140 +1118,78 @@ if (ScoreBrackets->getScoreLB(LGKM_CNT) < ScoreBrackets->getScoreUB(LGKM_CNT) && ScoreBrackets->hasPendingSMEM()) { - // Wait on everything, not just LGKM. vccz reads usually come from - // terminators, and we always wait on everything at the end of the - // block, so if we only wait on LGKM here, we might end up with - // another s_waitcnt inserted right after this if there are non-LGKM - // instructions still outstanding. - // FIXME: this is too conservative / the comment is wrong. - // We don't wait on everything at the end of the block and we combine - // waitcnts so we should never have back-to-back waitcnts. - ForceEmitZeroWaitcnt = true; - EmitWaitcnt = true; + Wait.LgkmCnt = 0; } } - // Does this operand processing indicate s_wait counter update? - if (EmitWaitcnt || IsForceEmitWaitcnt) { - int CntVal[NUM_INST_CNTS]; - - if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) { - // Force all waitcnts to 0. 
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - } - CntVal[VM_CNT] = 0; - CntVal[EXP_CNT] = 0; - CntVal[LGKM_CNT] = 0; - } else { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - if (EmitWaitcnt & CNT_MASK(T)) { - int Delta = - ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T); - int MaxDelta = ScoreBrackets->getWaitCountMax(T); - if (Delta >= MaxDelta) { - Delta = -1; - if (T != EXP_CNT) { - ScoreBrackets->setScoreLB( - T, ScoreBrackets->getScoreUB(T) - MaxDelta); - } - EmitWaitcnt &= ~CNT_MASK(T); - } - CntVal[T] = Delta; - } else { - // If we are not waiting for a particular counter then encode - // it as -1 which means "don't care." - CntVal[T] = -1; - } + // Early-out if no wait is indicated. + if (!ScoreBrackets->simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) { + if (OldWaitcntInstr) { + if (TrackedWaitcntSet.count(OldWaitcntInstr)) { + TrackedWaitcntSet.erase(OldWaitcntInstr); + OldWaitcntInstr->eraseFromParent(); + } else { + int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); + ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); } } + return; + } - MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt(); - int Imm = (!OldWaitcnt) ? 
0 : OldWaitcnt->getOperand(0).getImm(); - if (!OldWaitcnt || - (AMDGPU::decodeVmcnt(IV, Imm) != - (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) || - (AMDGPU::decodeExpcnt(IV, Imm) != - (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) || - (AMDGPU::decodeLgkmcnt(IV, Imm) != - (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) { - MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent()); - if (ContainingLoop) { - MachineBasicBlock *TBB = ContainingLoop->getHeader(); - BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); - if (!ScoreBracket) { - assert(!BlockVisitedSet.count(TBB)); - BlockWaitcntBracketsMap[TBB] = - llvm::make_unique<BlockWaitcntBrackets>(ST); - ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); - } - ScoreBracket->setRevisitLoop(true); - LLVM_DEBUG(dbgs() << "set-revisit2: Block" - << ContainingLoop->getHeader()->getNumber() << '\n';); - } - } + if (ForceEmitZeroWaitcnts) + Wait = AMDGPU::Waitcnt::allZero(); - // Update an existing waitcount, or make a new one. - unsigned Enc = AMDGPU::encodeWaitcnt(IV, - ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT], - ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT], - ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]); - // We don't remove waitcnts that existed prior to the waitcnt - // pass. Check if the waitcnt to-be-inserted can be avoided - // or if the prev waitcnt can be updated. 
- bool insertSWaitInst = true; - for (MachineBasicBlock::iterator I = MI.getIterator(), - B = MI.getParent()->begin(); - insertSWaitInst && I != B; --I) { - if (I == MI.getIterator()) - continue; + if (ForceEmitWaitcnt[VM_CNT]) + Wait.VmCnt = 0; + if (ForceEmitWaitcnt[EXP_CNT]) + Wait.ExpCnt = 0; + if (ForceEmitWaitcnt[LGKM_CNT]) + Wait.LgkmCnt = 0; - switch (I->getOpcode()) { - case AMDGPU::S_WAITCNT: - if (isWaitcntStronger(I->getOperand(0).getImm(), Enc)) - insertSWaitInst = false; - else if (!OldWaitcnt) { - OldWaitcnt = &*I; - Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc); - } - break; - // TODO: skip over instructions which never require wait. - } - break; - } - if (insertSWaitInst) { - if (OldWaitcnt) { - assert(OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT); - if (ForceEmitZeroWaitcnts) - LLVM_DEBUG(dbgs() - << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n"); - if (IsForceEmitWaitcnt) - LLVM_DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n"); - - OldWaitcnt->getOperand(0).setImm(Enc); - if (!OldWaitcnt->getParent()) - MI.getParent()->insert(MI, OldWaitcnt); - - LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" - << "Old Instr: " << MI << '\n' - << "New Instr: " << *OldWaitcnt << '\n'); - } else { - auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), - MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(Enc); - TrackedWaitcntSet.insert(SWaitInst); - - LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" - << "Old Instr: " << MI << '\n' - << "New Instr: " << *SWaitInst << '\n'); - } - } + ScoreBrackets->applyWaitcnt(Wait); - if (CntVal[EXP_CNT] == 0) { - ScoreBrackets->setMixedExpTypes(false); - } + AMDGPU::Waitcnt OldWait; + if (OldWaitcntInstr) { + OldWait = + AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm()); + } + if (OldWait.dominates(Wait)) + return; + + MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent()); + if (ContainingLoop) { + MachineBasicBlock *TBB = ContainingLoop->getHeader(); + 
BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); + if (!ScoreBracket) { + assert(!BlockVisitedSet.count(TBB)); + BlockWaitcntBracketsMap[TBB] = + llvm::make_unique<BlockWaitcntBrackets>(ST); + ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); + } + ScoreBracket->setRevisitLoop(true); + LLVM_DEBUG(dbgs() << "set-revisit2: Block" + << ContainingLoop->getHeader()->getNumber() << '\n';); + } + + if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr)) + Wait = Wait.combined(OldWait); + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + if (OldWaitcntInstr) { + OldWaitcntInstr->getOperand(0).setImm(Enc); + + LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *OldWaitcntInstr << '\n'); + } else { + auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), + MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(Enc); + TrackedWaitcntSet.insert(SWaitInst); + + LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *SWaitInst << '\n'); } } @@ -1570,21 +1512,33 @@ }); // Walk over the instructions. + MachineInstr *OldWaitcntInstr = nullptr; + for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end(); Iter != E;) { MachineInstr &Inst = *Iter; + // Remove any previously existing waitcnts. if (Inst.getOpcode() == AMDGPU::S_WAITCNT) { - // Leave pre-existing waitcnts, but note their existence via setWaitcnt. - // Remove the waitcnt-pass-generated waitcnts; the pass will add them back - // as needed. - if (!TrackedWaitcntSet.count(&Inst)) - ++Iter; - else { - ++Iter; - Inst.removeFromParent(); + if (OldWaitcntInstr) { + if (TrackedWaitcntSet.count(OldWaitcntInstr)) { + TrackedWaitcntSet.erase(OldWaitcntInstr); + OldWaitcntInstr->eraseFromParent(); + OldWaitcntInstr = nullptr; + } else if (!TrackedWaitcntSet.count(&Inst)) { + // Two successive s_waitcnt's, both of which are pre-existing and + // are therefore preserved. 
+ int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); + ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); + } else { + ++Iter; + Inst.eraseFromParent(); + continue; + } } - ScoreBrackets->setWaitcnt(&Inst); + + OldWaitcntInstr = &Inst; + ++Iter; continue; } @@ -1601,7 +1555,8 @@ // Generate an s_waitcnt instruction to be placed before // cur_Inst, if needed. - generateWaitcntInstBefore(Inst, ScoreBrackets); + generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); + OldWaitcntInstr = nullptr; updateEventWaitcntAfter(Inst, ScoreBrackets); @@ -1616,8 +1571,6 @@ } #endif - ScoreBrackets->clearWaitcnt(); - LLVM_DEBUG({ Inst.print(dbgs()); ScoreBrackets->dump(); @@ -1632,10 +1585,7 @@ Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P || Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) { // TODO: && context->target_info->GwsRequiresMemViolTest() ) { - ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - ScoreBrackets->updateByWait(LGKM_CNT, - ScoreBrackets->getScoreUB(LGKM_CNT)); + ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero()); } // TODO: Remove this work-around after fixing the scheduler and enable the Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -258,6 +258,32 @@ std::pair Default, bool OnlyFirstRequired = false); +/// Represents the counter values to wait for in an s_waitcnt instruction. +/// +/// Large values (including the maximum possible integer) can be used to +/// represent "don't care" waits. 
+struct Waitcnt { + unsigned VmCnt = ~0u; + unsigned ExpCnt = ~0u; + unsigned LgkmCnt = ~0u; + + Waitcnt() {} + Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt) + : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt) {} + + static Waitcnt allZero() { return Waitcnt(0, 0, 0); } + + bool dominates(const Waitcnt &Other) const { + return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt && + LgkmCnt <= Other.LgkmCnt; + } + + Waitcnt combined(const Waitcnt &Other) const { + return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt), + std::min(LgkmCnt, Other.LgkmCnt)); + } +}; + /// \returns Vmcnt bit mask for given isa \p Version. unsigned getVmcntBitMask(const IsaVersion &Version); @@ -291,6 +317,8 @@ void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); +Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded); + /// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version. unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Vmcnt); @@ -318,6 +346,8 @@ unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt); +unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded); + unsigned getInitialPSInputAddr(const Function &F); LLVM_READNONE Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -522,6 +522,14 @@ Lgkmcnt = decodeLgkmcnt(Version, Waitcnt); } +Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) { + Waitcnt Decoded; + Decoded.VmCnt = decodeVmcnt(Version, Encoded); + Decoded.ExpCnt = decodeExpcnt(Version, Encoded); + Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded); + return Decoded; +} + unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Vmcnt) { Waitcnt = @@ -552,6 +560,10 @@ 
return Waitcnt; } +unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) { + return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt); +} + unsigned getInitialPSInputAddr(const Function &F) { return getIntegerAttribute(F, "InitialPSInputAddr", 0); } Index: test/CodeGen/AMDGPU/smrd-vccz-bug.ll =================================================================== --- test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -5,7 +5,7 @@ ; GCN-FUNC: {{^}}vccz_workaround: ; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0 ; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, s{{[0-9]+}}, 0{{$}} -; VCCZ-BUG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VCCZ-BUG: s_waitcnt lgkmcnt(0) ; VCCZ-BUG: s_mov_b64 vcc, vcc ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc ; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]] Index: test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir =================================================================== --- test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -47,7 +47,7 @@ --- # CHECK-LABEL: name: vccz_corrupt_workaround # CHECK: $vcc = V_CMP_EQ_F32 -# CHECK-NEXT: S_WAITCNT 0 +# CHECK-NEXT: S_WAITCNT 127 # CHECK-NEXT: $vcc = S_MOV_B64 $vcc # CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit killed $vcc Index: test/CodeGen/AMDGPU/waitcnt-preexisting.mir =================================================================== --- test/CodeGen/AMDGPU/waitcnt-preexisting.mir +++ test/CodeGen/AMDGPU/waitcnt-preexisting.mir @@ -4,9 +4,10 @@ # GCN: S_WAITCNT -16257 # GGN: DS_READ2_B32 # GGN: DS_READ2_B32 -# GCN: S_WAITCNT 127{{$}} +# GCN: S_WAITCNT 383{{$}} # GCN-NEXT: $vgpr1 = V_ADD_U32_e32 1, killed $vgpr1, implicit $exec # GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec +# GCN-NEXT: S_WAITCNT 127{{$}} # GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec --- | define amdgpu_cs void @test() {