Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td @@ -68,6 +68,8 @@ def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">, Intrinsic<[], [], [IntrConvergent]>; +def int_amdgcn_s_waitcnt : Intrinsic<[], [llvm_i32_ty], []>; + def int_amdgcn_div_scale : Intrinsic< // 1st parameter: Numerator // 2nd parameter: Denominator Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -68,6 +68,10 @@ /// \brief Counter values we have already waited on. Counters WaitedOn; + /// \brief Counter values that we must wait on before the next counter + /// increase. + Counters DelayedWaitOn; + /// \brief Counter values for last instruction issued. Counters LastIssued; @@ -103,13 +107,17 @@ /// \brief Handle instructions async components void pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I); + MachineBasicBlock::iterator I, + const Counters& Increment); /// \brief Insert the actual wait instruction bool insertWait(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const Counters &Counts); + /// \brief Handle existing wait instructions (from intrinsics) + void handleExistingWait(MachineBasicBlock::iterator I); + /// \brief Do we need def2def checks? 
bool unorderedDefines(MachineInstr &MI); @@ -287,10 +295,10 @@ } void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator I, + const Counters &Increment) { // Get the hardware counter increments and sum them up - Counters Increment = getHwCounts(*I); Counters Limit = ZeroCounts; unsigned Sum = 0; @@ -430,16 +438,38 @@ Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); } +/// \brief check whether any of the counters is non-zero +static bool countersNonZero(const Counters &Counter) { + for (unsigned i = 0; i < 3; ++i) + if (Counter.Array[i]) + return true; + return false; +} + +void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { + assert(I->getOpcode() == AMDGPU::S_WAITCNT); + + unsigned Imm = I->getOperand(0).getImm(); + Counters Counts, WaitOn; + + Counts.Named.VM = Imm & 0xF; + Counts.Named.EXP = (Imm >> 4) & 0x7; + Counts.Named.LGKM = (Imm >> 8) & 0xF; + + for (unsigned i = 0; i < 3; ++i) { + if (Counts.Array[i] <= LastIssued.Array[i]) + WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + else + WaitOn.Array[i] = 0; + } + + increaseCounters(DelayedWaitOn, WaitOn); +} + Counters SIInsertWaits::handleOperands(MachineInstr &MI) { Counters Result = ZeroCounts; - // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, - // but we also want to wait for any other outstanding transfers before - // signalling other hardware blocks - if (MI.getOpcode() == AMDGPU::S_SENDMSG) - return LastIssued; - // For each register affected by this instruction increase the result // sequence. 
// @@ -544,6 +574,7 @@ MRI = &MF.getRegInfo(); WaitedOn = ZeroCounts; + DelayedWaitOn = ZeroCounts; LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; @@ -552,6 +583,8 @@ memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + SmallVector<MachineInstr *, 4> RemoveMI; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { @@ -607,13 +640,34 @@ I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) TII->insertWaitStates(MBB, std::next(I), 4); + // Record pre-existing, explicitly requested waits + if (I->getOpcode() == AMDGPU::S_WAITCNT) { + handleExistingWait(*I); + RemoveMI.push_back(I); + continue; + } + + Counters Required; + // Wait for everything before a barrier. - if (I->getOpcode() == AMDGPU::S_BARRIER) - Changes |= insertWait(MBB, I, LastIssued); + // + // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, + // but we also want to wait for any other outstanding transfers before + // signalling other hardware blocks + if (I->getOpcode() == AMDGPU::S_BARRIER || + I->getOpcode() == AMDGPU::S_SENDMSG) + Required = LastIssued; else - Changes |= insertWait(MBB, I, handleOperands(*I)); + Required = handleOperands(*I); + + Counters Increment = getHwCounts(*I); - pushInstruction(MBB, I); + if (countersNonZero(Required) || countersNonZero(Increment)) + increaseCounters(Required, DelayedWaitOn); + + Changes |= insertWait(MBB, I, Required); + + pushInstruction(MBB, I, Increment); handleSendMsg(MBB, I); } @@ -621,5 +675,8 @@ Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } + for (MachineInstr *I : RemoveMI) + I->eraseFromParent(); + return Changes; } Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td @@ -43,8 +43,9 @@ let ParserMethod = "parseSWaitCntOps"; } -def WAIT_FLAG : InstFlag<"printWaitFlag"> {
+def WAIT_FLAG : Operand <i16> { let ParserMatchClass = SWaitMatchClass; + let PrintMethod = "printWaitFlag"; } let SubtargetPredicate = isGCN in { @@ -506,6 +507,7 @@ let isConvergent = 1; } +let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; @@ -2452,6 +2454,11 @@ // SOPP Patterns //===----------------------------------------------------------------------===// +def : Pat < + (int_amdgcn_s_waitcnt i32:$simm16), + (S_WAITCNT (as_i16imm $simm16)) +>; + // FIXME: These should be removed eventually def : Pat < (int_AMDGPU_barrier_global), Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s + +; CHECK-LABEL: {{^}}test1: +; CHECK: image_store +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}} +; CHECK-NEXT: image_store +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> %d1, i32 %c0, i32 %c1) { + call void @llvm.amdgcn.image.store.i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0) + call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00 + call void @llvm.amdgcn.image.store.i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0) + ret void +} + +; Test that the intrinsic is merged with automatically generated waits and +; emitted as late as possible.
+; +; CHECK-LABEL: {{^}}test2: +; CHECK: image_load +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: s_waitcnt +; CHECK-NEXT: image_store +define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) { + %t = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00 + %c.1 = mul i32 %c, 2 + call void @llvm.amdgcn.image.store.i32(<4 x float> %t, i32 %c.1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.s.waitcnt(i32) #0 + +declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly }