Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -13,6 +13,14 @@
 /// Memory reads and writes are issued asynchronously, so we need to insert
 /// S_WAITCNT instructions when we want to access any of their results or
 /// overwrite any register that's used asynchronously.
+///
+/// TODO: This pass currently keeps one timeline per hardware counter. A more
+/// finely-grained approach that keeps one timeline per event type could
+/// sometimes get away with generating weaker s_waitcnt instructions. For
+/// example, when both SMEM and LDS are in flight and we need to wait for
+/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
+/// but the pass will currently generate a conservative lgkmcnt(0) because
+/// multiple event types are in flight.
 //
 //===----------------------------------------------------------------------===//
 
@@ -132,10 +140,13 @@
   NUM_WAIT_EVENTS,
 };
 
-iterator_range<enum_iterator<WaitEventType>> wait_event_types() {
-  return make_range(enum_iterator<WaitEventType>(VMEM_ACCESS),
-                    enum_iterator<WaitEventType>(NUM_WAIT_EVENTS));
-}
+static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
+  (1 << VMEM_ACCESS),
+  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+      (1 << SQ_MESSAGE),
+  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+      (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
+};
 
 // The mapping is:
 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
@@ -232,24 +243,12 @@
 
   // Mapping from event to counter.
   InstCounterType eventCounter(WaitEventType E) {
-    switch (E) {
-    case VMEM_ACCESS:
+    if (E == VMEM_ACCESS)
       return VM_CNT;
-    case LDS_ACCESS:
-    case GDS_ACCESS:
-    case SQ_MESSAGE:
-    case SMEM_ACCESS:
+    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
       return LGKM_CNT;
-    case EXP_GPR_LOCK:
-    case GDS_GPR_LOCK:
-    case VMW_GPR_LOCK:
-    case EXP_POS_ACCESS:
-    case EXP_PARAM_ACCESS:
-      return EXP_CNT;
-    default:
-      llvm_unreachable("unhandled event type");
-    }
-    return NUM_INST_CNTS;
+    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
+    return EXP_CNT;
   }
 
   void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
@@ -278,7 +277,8 @@
   void clear() {
     memset(ScoreLBs, 0, sizeof(ScoreLBs));
     memset(ScoreUBs, 0, sizeof(ScoreUBs));
-    memset(EventUBs, 0, sizeof(EventUBs));
+    PendingEvents = 0;
+    memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
     for (auto T : inst_counter_types())
       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
     memset(SgprScores, 0, sizeof(SgprScores));
@@ -296,15 +296,9 @@
   void setWaitAtBeginning() { WaitAtBeginning = true; }
   void clearWaitAtBeginning() { WaitAtBeginning = false; }
   bool getWaitAtBeginning() const { return WaitAtBeginning; }
-  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
   int32_t getMaxVGPR() const { return VgprUB; }
   int32_t getMaxSGPR() const { return SgprUB; }
 
-  int32_t getEventUB(enum WaitEventType W) const {
-    assert(W < NUM_WAIT_EVENTS);
-    return EventUBs[W];
-  }
-
   bool counterOutOfOrder(InstCounterType T) const;
   bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
   bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
@@ -316,11 +310,12 @@
                      const MachineRegisterInfo *MRI, WaitEventType E,
                      MachineInstr &MI);
 
-  bool hasPendingSMEM() const {
-    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
-            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
+  bool hasPendingEvent(WaitEventType E) const {
+    return PendingEvents & (1 << E);
   }
 
+  void mergePendingEvents(const BlockWaitcntBrackets &Other);
+
   bool hasPendingFlat() const {
     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
@@ -343,11 +338,6 @@
   void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
   int32_t getPostOrder() const { return PostOrder; }
 
-  bool mixedExpTypes() const { return MixedExpTypes; }
-  void setMixedExpTypes(bool MixedExpTypesIn) {
-    MixedExpTypes = MixedExpTypesIn;
-  }
-
   void print(raw_ostream &);
   void dump() { print(dbgs()); }
 
@@ -355,11 +345,11 @@
   const GCNSubtarget *ST = nullptr;
   bool WaitAtBeginning = false;
   bool RevisitLoop = false;
-  bool MixedExpTypes = false;
   int32_t PostOrder = 0;
   int32_t ScoreLBs[NUM_INST_CNTS] = {0};
   int32_t ScoreUBs[NUM_INST_CNTS] = {0};
-  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
+  uint32_t PendingEvents = 0;
+  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
   // Remember the last flat memory operation.
   int32_t LastFlat[NUM_INST_CNTS] = {0};
   // wait_cnt scores for every vgpr.
@@ -559,19 +549,17 @@
   const MachineRegisterInfo &MRIA = *MRI;
   InstCounterType T = eventCounter(E);
   int32_t CurrScore = getScoreUB(T) + 1;
-  // EventUB and ScoreUB need to be update regardless if this event changes
-  // the score of a register or not.
+  // PendingEvents and ScoreUB need to be updated regardless of whether this
+  // event changes the score of a register.
   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
-  EventUBs[E] = CurrScore;
+  if (!hasPendingEvent(E)) {
+    if (PendingEvents & WaitEventMaskForInst[T])
+      MixedPendingEvents[T] = true;
+    PendingEvents |= 1 << E;
+  }
   setScoreUB(T, CurrScore);
 
   if (T == EXP_CNT) {
-    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
-    // is required.
-    if (!MixedExpTypes) {
-      MixedExpTypes = counterOutOfOrder(EXP_CNT);
-    }
-
     // Put score on the source vgprs. If this is a store, just use those
     // specific register(s).
     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
@@ -803,9 +791,6 @@
   applyWaitcnt(VM_CNT, Wait.VmCnt);
   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
   applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
-
-  if (Wait.ExpCnt == 0)
-    setMixedExpTypes(false);
 }
 
 void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -818,76 +803,28 @@
     setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count));
   } else {
     setScoreLB(T, UB);
+    MixedPendingEvents[T] = false;
+    PendingEvents &= ~WaitEventMaskForInst[T];
+  }
+}
+
+void BlockWaitcntBrackets::mergePendingEvents(const BlockWaitcntBrackets &Other) {
+  for (auto T : inst_counter_types()) {
+    uint32_t Old = PendingEvents & WaitEventMaskForInst[T];
+    uint32_t New = Other.PendingEvents & WaitEventMaskForInst[T];
+    if (Other.MixedPendingEvents[T] || (Old && New && Old != New))
+      MixedPendingEvents[T] = true;
+    PendingEvents |= New;
   }
 }
 
 // Where there are multiple types of event in the bracket of a counter,
 // the decrement may go out of order.
 bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
-  switch (T) {
-  case VM_CNT:
-    return false;
-  case LGKM_CNT: {
-    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      // Scalar memory read always can go out of order.
-      return true;
-    }
-    int NumEventTypes = 0;
-    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (NumEventTypes <= 1) {
-      return false;
-    }
-    break;
-  }
-  case EXP_CNT: {
-    // If there has been a mixture of export types, then a waitcnt exp(0) is
-    // required.
-    if (MixedExpTypes)
-      return true;
-    int NumEventTypes = 0;
-    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-
-    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-
-    if (NumEventTypes <= 1) {
-      return false;
-    }
-    break;
-  }
-  default:
-    break;
-  }
-  return true;
+  // Scalar memory reads can always go out of order.
+  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
+    return true;
+  return MixedPendingEvents[T];
 }
 
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
@@ -1023,14 +960,12 @@
     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
       // Export and GDS are tracked individually, either may trigger a waitcnt
       // for EXEC.
-      ScoreBrackets->determineWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK), Wait);
-      ScoreBrackets->determineWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS), Wait);
-      ScoreBrackets->determineWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS), Wait);
-      ScoreBrackets->determineWait(
-          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK), Wait);
+      if (ScoreBrackets->hasPendingEvent(EXP_GPR_LOCK) ||
+          ScoreBrackets->hasPendingEvent(EXP_PARAM_ACCESS) ||
+          ScoreBrackets->hasPendingEvent(EXP_POS_ACCESS) ||
+          ScoreBrackets->hasPendingEvent(GDS_GPR_LOCK)) {
+        Wait.ExpCnt = 0;
+      }
     }
 
 #if 0 // TODO: the following code to handle CALL.
@@ -1135,7 +1070,7 @@
   if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
     if (ScoreBrackets->getScoreLB(LGKM_CNT) <
             ScoreBrackets->getScoreUB(LGKM_CNT) &&
-        ScoreBrackets->hasPendingSMEM()) {
+        ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
       Wait.LgkmCnt = 0;
     }
   }
@@ -1315,7 +1250,6 @@
   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
   int32_t MaxPending[NUM_INST_CNTS] = {0};
   int32_t MaxFlat[NUM_INST_CNTS] = {0};
-  bool MixedExpTypes = false;
 
   // For single basic block loops, we need to retain the Block's
   // score bracket to have accurate Pred info. So, make a copy of Block's
@@ -1351,25 +1285,6 @@
           PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
       MaxFlat[T] = std::max(MaxFlat[T], span);
     }
-
-    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
-  }
-
-  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-    bool Visited = BlockVisitedSet.count(Pred);
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
-      continue;
-    }
-
-    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
-                  PredScoreBrackets->getScoreLB(EXP_CNT);
-    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
-    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
-                  PredScoreBrackets->getScoreLB(EXP_CNT);
-    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
   }
 
 #if 0
@@ -1390,8 +1305,6 @@
     ScoreBrackets->setLastFlat(T, MaxFlat[T]);
   }
 
-  ScoreBrackets->setMixedExpTypes(MixedExpTypes);
-
   // Set the register scoreboard.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     if (!BlockVisitedSet.count(Pred)) {
@@ -1434,52 +1347,7 @@
       }
     }
 
-    // Also merge the WaitEvent information.
-    for (auto W : wait_event_types()) {
-      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
-      int PredEventUB = PredScoreBrackets->getEventUB(W);
-      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
-        int NewEventUB =
-            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
-        if (NewEventUB > 0) {
-          ScoreBrackets->setEventUB(
-              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
-        }
-      }
-    }
-  }
-
-  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
-  // sequencing predecessors, because changes to EXEC require waitcnts due to
-  // the delayed nature of these operations.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (!BlockVisitedSet.count(Pred)) {
-      continue;
-    }
-
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-
-    int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
-    if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
-      int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
-                       PredScoreBrackets->getScoreUB(EXP_CNT);
-      if (new_gds_ub > 0) {
-        ScoreBrackets->setEventUB(
-            GDS_GPR_LOCK,
-            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
-      }
-    }
-    int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
-    if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
-      int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
-                       PredScoreBrackets->getScoreUB(EXP_CNT);
-      if (new_exp_ub > 0) {
-        ScoreBrackets->setEventUB(
-            EXP_GPR_LOCK,
-            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
-      }
-    }
+    ScoreBrackets->mergePendingEvents(*PredScoreBrackets);
   }
 
   // if a single block loop, update the score brackets. Not needed for other
@@ -1562,7 +1430,7 @@
         (!VCCZBugHandledSet.count(&Inst))) {
       if (ScoreBrackets->getScoreLB(LGKM_CNT) <
               ScoreBrackets->getScoreUB(LGKM_CNT) &&
-          ScoreBrackets->hasPendingSMEM()) {
+          ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
         if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
           VCCZBugWorkAround = true;
       }