Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -13,6 +13,14 @@ /// Memory reads and writes are issued asynchronously, so we need to insert /// S_WAITCNT instructions when we want to access any of their results or /// overwrite any register that's used asynchronously. +/// +/// TODO: This pass currently keeps one timeline per hardware counter. A more +/// finely-grained approach that keeps one timeline per event type could +/// sometimes get away with generating weaker s_waitcnt instructions. For +/// example, when both SMEM and LDS are in flight and we need to wait for +/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, +/// but the pass will currently generate a conservative lgkmcnt(0) because +/// multiple event types are in flight. // //===----------------------------------------------------------------------===// @@ -132,10 +140,13 @@ NUM_WAIT_EVENTS, }; -iterator_range> wait_event_types() { - return make_range(enum_iterator(VMEM_ACCESS), - enum_iterator(NUM_WAIT_EVENTS)); -} +static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = { + (1 << VMEM_ACCESS), + (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | + (1 << SQ_MESSAGE), + (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | + (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS), +}; // The mapping is: // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs @@ -232,24 +243,12 @@ // Mapping from event to counter. 
InstCounterType eventCounter(WaitEventType E) { - switch (E) { - case VMEM_ACCESS: + if (E == VMEM_ACCESS) return VM_CNT; - case LDS_ACCESS: - case GDS_ACCESS: - case SQ_MESSAGE: - case SMEM_ACCESS: + if (WaitEventMaskForInst[LGKM_CNT] & (1 << E)) return LGKM_CNT; - case EXP_GPR_LOCK: - case GDS_GPR_LOCK: - case VMW_GPR_LOCK: - case EXP_POS_ACCESS: - case EXP_PARAM_ACCESS: - return EXP_CNT; - default: - llvm_unreachable("unhandled event type"); - } - return NUM_INST_CNTS; + assert(WaitEventMaskForInst[EXP_CNT] & (1 << E)); + return EXP_CNT; } void setRegScore(int GprNo, InstCounterType T, int32_t Val) { @@ -278,7 +277,8 @@ void clear() { memset(ScoreLBs, 0, sizeof(ScoreLBs)); memset(ScoreUBs, 0, sizeof(ScoreUBs)); - memset(EventUBs, 0, sizeof(EventUBs)); + PendingEvents = 0; + memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents)); for (auto T : inst_counter_types()) memset(VgprScores[T], 0, sizeof(VgprScores[T])); memset(SgprScores, 0, sizeof(SgprScores)); @@ -296,15 +296,9 @@ void setWaitAtBeginning() { WaitAtBeginning = true; } void clearWaitAtBeginning() { WaitAtBeginning = false; } bool getWaitAtBeginning() const { return WaitAtBeginning; } - void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; } int32_t getMaxVGPR() const { return VgprUB; } int32_t getMaxSGPR() const { return SgprUB; } - int32_t getEventUB(enum WaitEventType W) const { - assert(W < NUM_WAIT_EVENTS); - return EventUBs[W]; - } - bool counterOutOfOrder(InstCounterType T) const; bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const; @@ -316,11 +310,12 @@ const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &MI); - bool hasPendingSMEM() const { - return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]); + bool hasPendingEvent(WaitEventType E) const { + return PendingEvents & (1 << E); } + void mergePendingEvents(const BlockWaitcntBrackets &Other); + bool 
hasPendingFlat() const { return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || @@ -343,11 +338,6 @@ void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; } int32_t getPostOrder() const { return PostOrder; } - bool mixedExpTypes() const { return MixedExpTypes; } - void setMixedExpTypes(bool MixedExpTypesIn) { - MixedExpTypes = MixedExpTypesIn; - } - void print(raw_ostream &); void dump() { print(dbgs()); } @@ -355,11 +345,11 @@ const GCNSubtarget *ST = nullptr; bool WaitAtBeginning = false; bool RevisitLoop = false; - bool MixedExpTypes = false; int32_t PostOrder = 0; int32_t ScoreLBs[NUM_INST_CNTS] = {0}; int32_t ScoreUBs[NUM_INST_CNTS] = {0}; - int32_t EventUBs[NUM_WAIT_EVENTS] = {0}; + uint32_t PendingEvents = 0; + bool MixedPendingEvents[NUM_INST_CNTS] = {false}; // Remember the last flat memory operation. int32_t LastFlat[NUM_INST_CNTS] = {0}; // wait_cnt scores for every vgpr. @@ -559,19 +549,17 @@ const MachineRegisterInfo &MRIA = *MRI; InstCounterType T = eventCounter(E); int32_t CurrScore = getScoreUB(T) + 1; - // EventUB and ScoreUB need to be update regardless if this event changes - // the score of a register or not. + // PendingEvents and ScoreUB need to be updated regardless of whether this + // event changes the score of a register or not. // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. - EventUBs[E] = CurrScore; + if (!hasPendingEvent(E)) { + if (PendingEvents & WaitEventMaskForInst[T]) + MixedPendingEvents[T] = true; + PendingEvents |= 1 << E; + } setScoreUB(T, CurrScore); if (T == EXP_CNT) { - // Check for mixed export types. If they are mixed, then a waitcnt exp(0) - // is required. - if (!MixedExpTypes) { - MixedExpTypes = counterOutOfOrder(EXP_CNT); - } - // Put score on the source vgprs. If this is a store, just use those // specific register(s). 
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { @@ -803,9 +791,6 @@ applyWaitcnt(VM_CNT, Wait.VmCnt); applyWaitcnt(EXP_CNT, Wait.ExpCnt); applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); - - if (Wait.ExpCnt == 0) - setMixedExpTypes(false); } void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { @@ -818,76 +803,28 @@ setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count)); } else { setScoreLB(T, UB); + MixedPendingEvents[T] = false; + PendingEvents &= ~WaitEventMaskForInst[T]; + } +} + +void BlockWaitcntBrackets::mergePendingEvents(const BlockWaitcntBrackets &Other) { + for (auto T : inst_counter_types()) { + uint32_t Old = PendingEvents & WaitEventMaskForInst[T]; + uint32_t New = Other.PendingEvents & WaitEventMaskForInst[T]; + if (Other.MixedPendingEvents[T] || (Old && New && Old != New)) + MixedPendingEvents[T] = true; + PendingEvents |= New; } } // Where there are multiple types of event in the bracket of a counter, // the decrement may go out of order. bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const { - switch (T) { - case VM_CNT: - return false; - case LGKM_CNT: { - if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) { - // Scalar memory read always can go out of order. - return true; - } - int NumEventTypes = 0; - if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { - NumEventTypes++; - } - if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { - NumEventTypes++; - } - if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] && - EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) { - NumEventTypes++; - } - if (NumEventTypes <= 1) { - return false; - } - break; - } - case EXP_CNT: { - // If there has been a mixture of export types, then a waitcnt exp(0) is - // required. 
- if (MixedExpTypes) - return true; - int NumEventTypes = 0; - if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] && - EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] && - EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] && - EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] && - EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - - if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] && - EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - - if (NumEventTypes <= 1) { - return false; - } - break; - } - default: - break; - } - return true; + // Scalar memory read always can go out of order. + if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) + return true; + return MixedPendingEvents[T]; } INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, @@ -1023,14 +960,12 @@ if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { // Export and GDS are tracked individually, either may trigger a waitcnt // for EXEC. - ScoreBrackets->determineWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK), Wait); - ScoreBrackets->determineWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS), Wait); - ScoreBrackets->determineWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS), Wait); - ScoreBrackets->determineWait( - EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK), Wait); + if (ScoreBrackets->hasPendingEvent(EXP_GPR_LOCK) || + ScoreBrackets->hasPendingEvent(EXP_PARAM_ACCESS) || + ScoreBrackets->hasPendingEvent(EXP_POS_ACCESS) || + ScoreBrackets->hasPendingEvent(GDS_GPR_LOCK)) { + Wait.ExpCnt = 0; + } } #if 0 // TODO: the following code to handle CALL. 
@@ -1135,7 +1070,7 @@ if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) { if (ScoreBrackets->getScoreLB(LGKM_CNT) < ScoreBrackets->getScoreUB(LGKM_CNT) && - ScoreBrackets->hasPendingSMEM()) { + ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) { Wait.LgkmCnt = 0; } } @@ -1315,7 +1250,6 @@ BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); int32_t MaxPending[NUM_INST_CNTS] = {0}; int32_t MaxFlat[NUM_INST_CNTS] = {0}; - bool MixedExpTypes = false; // For single basic block loops, we need to retain the Block's // score bracket to have accurate Pred info. So, make a copy of Block's @@ -1351,25 +1285,6 @@ PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T); MaxFlat[T] = std::max(MaxFlat[T], span); } - - MixedExpTypes |= PredScoreBrackets->mixedExpTypes(); - } - - // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK. - for (MachineBasicBlock *Pred : Block.predecessors()) { - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - bool Visited = BlockVisitedSet.count(Pred); - if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { - continue; - } - - int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) - - PredScoreBrackets->getScoreLB(EXP_CNT); - MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan); - int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) - - PredScoreBrackets->getScoreLB(EXP_CNT); - MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan); } #if 0 @@ -1390,8 +1305,6 @@ ScoreBrackets->setLastFlat(T, MaxFlat[T]); } - ScoreBrackets->setMixedExpTypes(MixedExpTypes); - // Set the register scoreboard. for (MachineBasicBlock *Pred : Block.predecessors()) { if (!BlockVisitedSet.count(Pred)) { @@ -1434,52 +1347,7 @@ } } - // Also merge the WaitEvent information. 
- for (auto W : wait_event_types()) { - enum InstCounterType T = PredScoreBrackets->eventCounter(W); - int PredEventUB = PredScoreBrackets->getEventUB(W); - if (PredEventUB > PredScoreBrackets->getScoreLB(T)) { - int NewEventUB = - MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T); - if (NewEventUB > 0) { - ScoreBrackets->setEventUB( - W, std::max(ScoreBrackets->getEventUB(W), NewEventUB)); - } - } - } - } - - // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the - // sequencing predecessors, because changes to EXEC require waitcnts due to - // the delayed nature of these operations. - for (MachineBasicBlock *Pred : Block.predecessors()) { - if (!BlockVisitedSet.count(Pred)) { - continue; - } - - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - - int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK); - if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { - int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub - - PredScoreBrackets->getScoreUB(EXP_CNT); - if (new_gds_ub > 0) { - ScoreBrackets->setEventUB( - GDS_GPR_LOCK, - std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub)); - } - } - int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK); - if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { - int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub - - PredScoreBrackets->getScoreUB(EXP_CNT); - if (new_exp_ub > 0) { - ScoreBrackets->setEventUB( - EXP_GPR_LOCK, - std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub)); - } - } + ScoreBrackets->mergePendingEvents(*PredScoreBrackets); } // if a single block loop, update the score brackets. 
Not needed for other @@ -1562,7 +1430,7 @@ (!VCCZBugHandledSet.count(&Inst))) { if (ScoreBrackets->getScoreLB(LGKM_CNT) < ScoreBrackets->getScoreUB(LGKM_CNT) && - ScoreBrackets->hasPendingSMEM()) { + ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) { if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) VCCZBugWorkAround = true; }