diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -30,6 +30,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/DebugCounter.h"
@@ -365,8 +366,12 @@
   DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  DenseMap<MachineBasicBlock *, MachineLoop *> PreheaderLoop;
+  MachineLoopInfo *MLI;
   MachinePostDominatorTree *PDT;
 
+  bool shouldFlush(MachineBasicBlock *MBB);
+
   struct BlockInfo {
     MachineBasicBlock *MBB;
     std::unique_ptr<WaitcntBrackets> Incoming;
@@ -399,6 +404,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -439,6 +445,9 @@
   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
+  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                               WaitcntBrackets &ScoreBrackets,
+                               MachineInstr *OldWaitcntInstr);
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
                                  MachineInstr *OldWaitcntInstr);
@@ -794,6 +803,7 @@
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                       false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                     false)
@@ -806,10 +816,11 @@
   return new SIInsertWaitcnts();
 }
 
-/// Combine consecutive waitcnt instructions that precede \p MI and follow
-/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
-/// by previous passes. Currently this pass conservatively assumes that these
-/// preexisting waitcnt are required for correctness.
+/// Combine consecutive waitcnt instructions that precede \p MI (or the end of
+/// the block if \p MI is null) and follow \p OldWaitcntInstr, and apply any
+/// extra wait from waitcnt that were added by previous passes. Currently this
+/// pass conservatively assumes that these preexisting waitcnt are required for
+/// correctness.
 bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                                                MachineInstr &OldWaitcntInstr,
                                                AMDGPU::Waitcnt &Wait,
@@ -817,8 +828,9 @@
   bool Modified = false;
   MachineInstr *WaitcntInstr = nullptr;
   MachineInstr *WaitcntVsCntInstr = nullptr;
+  auto EndBlockIt = OldWaitcntInstr.getParent()->end();
   for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
-       &*II != MI; II = NextI, ++NextI) {
+       II != EndBlockIt && &*II != MI; II = NextI, ++NextI) {
     if (II->isMetaInstruction())
       continue;
 
@@ -928,6 +940,48 @@
   return true;
 }
 
+/// Generate an s_waitcnt instruction to be placed at the end of \p Block.
+/// We may need to flush vmcnt in loop preheaders (by inserting a wait at the
+/// end of the block) to avoid generating waits inside the loop body. These
+/// flush waits are inserted before terminator instructions in
+/// generateWaitcntInstBefore; this function handles the case in which
+/// \p Block has no terminator instruction.
+bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                                               WaitcntBrackets &ScoreBrackets,
+                                               MachineInstr *OldWaitcntInstr) {
+  AMDGPU::Waitcnt Wait;
+  bool Modified = false;
+
+  unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+  unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+  if (UB - LB != 0 && shouldFlush(&Block)) {
+    Wait.VmCnt = 0;
+    if (OldWaitcntInstr) {
+      // Try to merge the required wait with preexisting waitcnt instructions.
+      // Also erase redundant waitcnt.
+      Modified = applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
+                                         Wait, nullptr);
+    } else {
+      // Update waitcnt brackets after determining the required wait.
+      ScoreBrackets.applyWaitcnt(Wait);
+    }
+  }
+
+  // Build new waitcnt instructions unless no wait is needed.
+  if (Wait.hasWaitExceptVsCnt()) {
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+    auto SWaitInst = BuildMI(Block, Block.end(), Block.back().getDebugLoc(),
+                             TII->get(AMDGPU::S_WAITCNT))
+                         .addImm(Enc);
+    TrackedWaitcntSet.insert(SWaitInst);
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcntBlockEnd\n"
+                      << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  return Modified;
+}
+
 /// Generate s_waitcnt instruction to be placed before cur_Inst.
 /// Instructions of a given type are returned in order,
 /// but instructions of different types can complete out of order.
@@ -1180,6 +1234,13 @@
   if (ForceEmitWaitcnt[VS_CNT])
     Wait.VsCnt = 0;
 
+  // If MI is the first terminator of a loop preheader that should flush vmcnt,
+  // wait for all outstanding VMEM operations here rather than inside the loop.
+  if (MI.getParent()->getFirstTerminator() == MI) {
+    unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+    unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+    if (UB - LB != 0 && shouldFlush(MI.getParent()))
+      Wait.VmCnt = 0;
+  }
+
   if (OldWaitcntInstr) {
     // Try to merge the required wait with preexisting waitcnt instructions.
     // Also erase redundant waitcnt.
@@ -1573,9 +1634,49 @@
     ++Iter;
   }
 
+  // Generate an s_waitcnt instruction to be placed at the end of Block, if
+  // needed.
+  Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+
   return Modified;
 }
 
+// Determine if we should flush VMCNT in the given block. It may be worth
+// waiting for VMCNT in loop preheaders to avoid unnecessarily waiting in each
+// iteration inside loops. This function returns true if MBB is such a
+// preheader.
+bool SIInsertWaitcnts::shouldFlush(MachineBasicBlock *MBB) {
+  // We only flush on pre-GFX10 targets, which have no separate VSCNT counter
+  // for VMEM stores.
+  if (ST->hasVscnt())
+    return false;
+  if (!PreheaderLoop.count(MBB))
+    return false;
+  MachineLoop *Loop = PreheaderLoop[MBB];
+
+  // We flush in the block if it is a preheader and its associated loop
+  // contains at least one VMEM store and a use of a value loaded by VMEM, but
+  // doesn't contain a VMEM load.
+  bool HasVMemLoad = false;
+  bool HasVMemStore = false;
+  bool HasVMemLoadUse = false;
+  for (MachineBasicBlock *LoopMBB : Loop->blocks()) {
+    for (MachineInstr &MI : *LoopMBB) {
+      if (SIInstrInfo::isVMEM(MI)) {
+        if (MI.mayLoad())
+          HasVMemLoad = true;
+        if (MI.mayStore())
+          HasVMemStore = true;
+      }
+      // Check whether MI uses a VGPR whose defining instruction is a load.
+      for (MachineOperand &Op : MI.uses()) {
+        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
+          continue;
+        for (const MachineOperand &Def :
+             make_range(MRI->def_begin(Op.getReg()), MRI->def_end()))
+          if (Def.getParent()->mayLoad())
+            HasVMemLoadUse = true;
+      }
+    }
+  }
+  return HasVMemStore && HasVMemLoadUse && !HasVMemLoad;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
@@ -1583,6 +1684,7 @@
   MRI = &MF.getRegInfo();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
@@ -1608,6 +1710,13 @@
   BlockInfos.clear();
   bool Modified = false;
 
+  // Record the loop associated with each loop preheader.
+  for (MachineLoop *ML : *MLI) {
+    MachineBasicBlock *Preheader = ML->getLoopPreheader();
+    if (Preheader)
+      PreheaderLoop[Preheader] = ML;
+  }
+
   if (!MFI->isEntryFunction()) {
     // Wait for any outstanding memory operations that the input registers may
     // depend on. We can't track them and it's better to do the wait after the
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -0,0 +1,173 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+
+--- |
+
+  @spill = external addrspace(1) global i32
+
+  define amdgpu_cs void @waitcnt_vm_loop() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_noterm() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_load() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_no_store() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_no_use() {
+    ret void
+  }
+
+...
+---
+
+# The loop contains a VMEM store and uses the value loaded by VMEM in the
+# preheader, but no VMEM load: flush vmcnt before the preheader terminator
+# and do not wait again inside the loop.
+# GFX9-LABEL: name: waitcnt_vm_loop
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-NEXT: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
name: waitcnt_vm_loop
body: |
  bb.0:
    successors: %bb.1

    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
+---
+
+# Same as waitcnt_vm_loop, but the preheader has no terminator: the flush
+# wait is inserted at the end of bb.0.
+# GFX9-LABEL: name: waitcnt_vm_loop_noterm
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+name: waitcnt_vm_loop_noterm
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop also contains a VMEM load, so the vmcnt wait cannot be hoisted out
+# of the loop: do not flush in the preheader and keep the wait in the loop
+# body.
+# GFX9-LABEL: name: waitcnt_vm_loop_load
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+name: waitcnt_vm_loop_load
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    renamable $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr7, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr7, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop has no VMEM store, so there is nothing to gain from flushing in
+# the preheader: the wait for the preheader load stays in the loop body.
+# GFX9-LABEL: name: waitcnt_vm_loop_no_store
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+name: waitcnt_vm_loop_no_store
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop does not use the value loaded in the preheader, so do not flush:
+# the vmcnt wait is only needed at the use in bb.2.
+# GFX9-LABEL: name: waitcnt_vm_loop_no_use
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9-LABEL: bb.2:
+# GFX9: S_WAITCNT 3953
+name: waitcnt_vm_loop_no_use
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr2, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr0, implicit $exec
+    S_ENDPGM 0
+
+...